From 351f4318d007ca1329f2f27983af77a97993f72f Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Mon, 6 May 2024 15:36:45 -0700
Subject: [PATCH 01/49] Llamaish7 config, finally

---
 configs/llamaish7-s3.yaml          | 1284 ++++++++++++++++++++++++++++
 scripts/beaker/llamaish7-launch.sh |   32 +
 scripts/beaker/llamaish7.sh        |   43 +
 3 files changed, 1359 insertions(+)
 create mode 100644 configs/llamaish7-s3.yaml
 create mode 100755 scripts/beaker/llamaish7-launch.sh
 create mode 100755 scripts/beaker/llamaish7.sh

diff --git a/configs/llamaish7-s3.yaml b/configs/llamaish7-s3.yaml
new file mode 100644
index 000000000..5d52eeef6
--- /dev/null
+++ b/configs/llamaish7-s3.yaml
@@ -0,0 +1,1284 @@
+run_name: llamaish7-001
+seed: 6198
+dry_run: false
+
+wandb:
+  name: ${run_name}
+  project: olmo-medium
+  group: llamaish7
+
+model:
+  d_model: 4096
+  n_heads: 32
+  n_layers: 32
+  # mlp_ratio: 6
+  mlp_hidden_size: 22016
+  weight_tying: false
+  alibi: false
+  rope: true
+  flash_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  clip_qkv: 8.0
+  include_bias: false
+  block_type: sequential
+  layer_norm_type: default
+  layer_norm_with_affine: false
+  bias_for_layer_norm: false
+  attention_layer_norm_with_affine: false
+  activation_type: swiglu
+  residual_dropout: 0.0
+  embedding_dropout: 0.0
+  max_sequence_length: 4096
+  vocab_size: 50280
+  embedding_size: 50304
+  eos_token_id: 0
+  pad_token_id: 1
+  init_device: meta
+  init_fn: full_megatron
+  init_std: 0.006
+  init_cutoff_factor: 3
+
+compile: null
+
+optimizer:
+  name: adamw
+  learning_rate: 3.0e-4
+  weight_decay: 0.1
+  decay_norm_and_bias: true
+  decay_embeddings: false
+  betas:
+  - 0.9
+  - 0.95
+  metrics_log_interval: 10
+
+scheduler:
+  name: cosine_with_warmup
+  units: tokens
+  t_warmup: 20971520000
+  t_max: 3e12
+  alpha_f: 0.1
+  grad_clip_warmup_steps: 2097152000
+  grad_clip_warmup_factor: 5
+
+tokenizer:
+  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
+  truncate_direction: right
+
+save_folder: runs/${run_name}
+remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name}
+save_overwrite: false
+
+save_interval: 1000
+save_interval_ephemeral: null
+save_num_checkpoints_to_keep: -1
+sharded_checkpointer: olmo_core
+
+save_interval_unsharded: null
+save_num_unsharded_checkpoints_to_keep: -1
+
+load_path: null
+
+max_duration: 2ep
+global_train_batch_size: 512
+device_train_microbatch_size: 2
+
+precision: amp_bf16
+
+fsdp:
+  wrapping_strategy: by_block_and_size
+  precision: mixed
+
+max_grad_norm: 1.0
+max_grad_norm_ratio: null
+
+speed_monitor:
+  window_size: 1
+
+eval_interval: 1000
+eval_subset_num_batches: -1
+device_eval_batch_size: ${device_train_microbatch_size}
+evaluators:
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      datasets:
+        c4_en-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy
+        dolma_books-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy
+        dolma_common-crawl-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy
+        dolma_pes2o-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy
+        dolma_reddit-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy
+        dolma_stack-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy
+        dolma_wiki-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy
+        ice-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy
+        m2d2_s2orc-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy
+        pile-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy
+        wikitext_103-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy
+
+  ##########################
+  # Downstream evaluations #
+  ##########################
+  - label: piqa
+    type: downstream
+
+  - label: hellaswag
+    type: downstream
+
+  - label: winogrande
+    type: downstream
+
+  - label: openbook_qa
+    type: downstream
+
+  - label: boolq
+    type: downstream
+
+  - label: sciq
+    type: downstream
+
+  - label: arc_easy
+    type: downstream
+
+  - label: arc_challenge
+    type: downstream
+
+  - label: copa
+    type: downstream
+
+  #- label: rte
+  #  type: downstream
+
+  #- label: commitment_bank
+  #  type: downstream
+
+  #- label: sst2
+  #  type: downstream
+
+  - label: commonsense_qa
+    type: downstream
+
+  - label: social_iqa
+    type: downstream
+
+  # Doesn't work from cache.
+  # - label: basic_arithmetic
+  #   type: downstream
+
+  - label: mmlu_stem_var
+    type: downstream
+
+  - label: mmlu_humanities_var
+    type: downstream
+
+  - label: mmlu_social_sciences_var
+    type: downstream
+
+  - label: mmlu_other_var
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_other_mc_5shot_test
+    type: downstream
+
+data:
+  pad_direction: right
+  num_workers: 32
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: 8
+  persistent_workers: true
+  timeout: 0
+  instance_filter:
+    repetition_max_period: 13
+    repetition_min_period: 1
+    repetition_max_count: 32
+  paths:
+    ######### NON WEB DATA #########
+    # ~> GUTENBERG BOOKS (5.256 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy
+    # ~> PES2O STEM PAPERS (57.21 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/pes2o/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    # MEGAWIKA v1 (4.6 GT)
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-01-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-02-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-23-00002.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy
+    - s3://ai2-llm/preprocessed/megawika/v1/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy
+    # ~> REDPAJAMA STACK-EXCHANGE (19.63 GT)
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/stackexchange/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    # ~> REDPAJAMA ARXIV (27.97 GT)
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy
+    - s3://ai2-llm/preprocessed/redpajama_v1_decon_fix/arxiv/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy
+    # ~> PROOFPILE2 ALGEBRAIC STACK (12.623 GT)
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    # ~> PROOFPILE2 OPENWEBMATH (12.734 GT)
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    # ~> TULU FLAN V1 (16.5 GT, v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline)
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy
+    - s3://ai2-llm/preprocessed/tulu_flan/v2-decontaminated-60M-shots_all-upweight_1-dialog_false-sep_newline/train/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy
+    # ~> CC NEWS (14.3 GT)
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-0-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-1-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-2-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-3-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-4-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-5-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-6-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00002.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-7-00003.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00000.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00001.npy
+    - s3://ai2-llm/preprocessed/cc-news/v3/gpt-neox-olmo-dolma-v1_5/part-8-00002.npy
+    ####################################
+    ######### CODE #########
+    # ~> STARCODER (263.775 GT)
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-00-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-03-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-04-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-05-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-06-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-07-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-08-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-09-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-10-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-11-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-12-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-13-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-14-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-15-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-16-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-17-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-18-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-19-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-20-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-21-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-22-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-23-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-24-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-25-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-26-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-27-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-30-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-31-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-32-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-33-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-34-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-35-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-36-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-37-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-38-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-39-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-40-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-41-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-42-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-43-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-44-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-46-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-47-00001.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy
+    - s3://ai2-llm/preprocessed/starcoder/v0_decontaminated_doc_only/gpt-neox-olmo-dolma-v1_5/part-48-00001.npy
+    ####################################
+    ######### WEB HIGH QUALITY #########
+    # ~> C4 (138.4 GT)
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy
+    - s3://ai2-llm/preprocessed/c4/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001-fix/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy
+    # ~> REDDIT (79.9 GT)
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-00-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-01-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-02-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-03-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-04-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-05-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-06-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-07-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-08-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-09-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-10-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-11-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-12-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-13-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-14-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-15-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-16-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-17-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-18-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-19-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-20-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-21-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-22-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-23-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-24-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-25-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-26-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-27-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-28-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-29-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-30-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-31-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-32-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-33-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-34-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-35-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-36-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-37-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-38-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-39-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-40-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-41-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-42-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-43-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-44-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-45-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-46-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-47-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-48-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-49-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-50-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-51-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-52-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-53-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-54-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-55-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-56-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-57-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-58-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-59-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-60-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-61-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-62-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-63-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-64-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-65-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-66-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-67-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-68-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-69-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-70-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-71-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-72-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-73-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-74-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-75-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-76-00000.npy
+    - s3://ai2-llm/preprocessed/reddit/v5-dedupe-pii-nsfw-toxic-fuzzydd-length/gpt-neox-olmo-dolma-v1_5/part-77-00000.npy
+    # ~> FALCON (547.341 GT)
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-093-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-094-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-095-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-096-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-097-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-098-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-099-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-100-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-101-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-102-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-103-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-104-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-105-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-106-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-107-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-108-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-109-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-110-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-111-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-112-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-113-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-114-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-115-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-116-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-117-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-118-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-119-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-120-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-121-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-122-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-123-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-124-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-125-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-126-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-127-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-128-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-129-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-130-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-131-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-132-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-133-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-134-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-135-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-136-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-137-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-138-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-139-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-140-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-141-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-142-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-143-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-144-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-145-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-146-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-147-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-148-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-149-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-150-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-151-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-152-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-153-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-154-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-155-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-156-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-157-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-158-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-159-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-160-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-161-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-162-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-163-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-164-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-165-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-166-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-167-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-168-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-169-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-170-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-171-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-172-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-173-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-174-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-175-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-176-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-177-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-178-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-179-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-180-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-181-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-182-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-183-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-184-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-185-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-186-00000.npy
+    - s3://ai2-llm/preprocessed/falcon-refinedweb/v2-frac_005_100-qc_cc_multi_bin-paloma-rep-pii/gpt-neox-olmo-dolma-v1_5/part-187-00000.npy
+    ####################################
+    ######### WEB REST #########
+    # ~> DOLMA CC HEAD 50% (178.4 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-006-00001.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_head/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy
+    # ~> DOLMA CC MIDDLE 33% (242.05 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_middle/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy
+    # ~> DOLMA CC TAIL 33% (191.4 GT)
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-063-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-064-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-065-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-066-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-067-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-068-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-069-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-070-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-071-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-072-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-073-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-074-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-075-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-076-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-077-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-078-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-079-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-080-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-081-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-082-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-083-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-084-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-085-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-086-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-087-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-088-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-089-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-090-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-091-00000.npy
+    - s3://ai2-llm/preprocessed/olmo-mix/v1_7-dd_ngram_dp_030-qc_cc_en_bin_001/cc_en_tail/gpt-neox-olmo-dolma-v1_5/part-092-00000.npy
diff --git a/scripts/beaker/llamaish7-launch.sh b/scripts/beaker/llamaish7-launch.sh
new file mode 100755
index 000000000..e7ced5ba6
--- /dev/null
+++ b/scripts/beaker/llamaish7-launch.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -ex
+
+NUM_NODES=8
+
+gantry run \
+  --workspace ai2/dirkg \
+  --task-name llamaish7 \
+  --description "OLMo medium - 7B - Llamaish" \
+  --priority high \
+  --beaker-image shanea/olmo-torch2.2-gantry \
+  --cluster ai2/pluto-cirrascale \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --budget ai2/oe-training \
+  --no-nfs \
+  --propagate-failure \
+  --synchronized-start-timeout 10m \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/llamaish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
diff --git a/scripts/beaker/llamaish7.sh b/scripts/beaker/llamaish7.sh
new file mode 100755
index 000000000..332dfb0be
--- /dev/null
+++ b/scripts/beaker/llamaish7.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
+
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
+
+NUM_NODES=$1
+shift
+
+BEAKER_REPLICA_RANK=$1
+shift
+
+# Warm HF cache
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=12347 \
+  --rdzv_backend=static \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  --node_rank=$BEAKER_REPLICA_RANK \
+  --rdzv_conf="read_timeout=420" \
+  scripts/train.py \
+  configs/llamaish7-s3.yaml \
+    --run_name=llamaish7 \
+    --wandb.name=llamaish7 \
+    --model.flash_attention=true \
+    --fsdp.wrapping_strategy=by_block_and_size \
+    --fsdp.sharding_strategy=SHARD_GRAD_OP \
+    --save_folder=runs/ \
+    --activation_checkpointing=fine_grained \
+    --fused_loss=true \
+    --device_train_microbatch_size=2 \
+    --global_train_batch_size=1024
+    
+    #--save_overwrite \
+    #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7/}'
\ No newline at end of file

From 4d3afa63c451ea89e034bbae092adcec7ab6f7c9 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Tue, 7 May 2024 16:02:39 -0700
Subject: [PATCH 02/49] Create way more detailed artifacts

---
 scripts/beaker/llamaish7.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/beaker/llamaish7.sh b/scripts/beaker/llamaish7.sh
index 332dfb0be..6115fe71b 100755
--- a/scripts/beaker/llamaish7.sh
+++ b/scripts/beaker/llamaish7.sh
@@ -28,8 +28,9 @@ torchrun \
   --rdzv_conf="read_timeout=420" \
   scripts/train.py \
   configs/llamaish7-s3.yaml \
-    --run_name=llamaish7 \
-    --wandb.name=llamaish7 \
+    --run_name=llamaish7-detailed \
+    --wandb.name=llamaish7-detailed \
+    --wandb.group=llamaish7-detailed \
     --model.flash_attention=true \
     --fsdp.wrapping_strategy=by_block_and_size \
     --fsdp.sharding_strategy=SHARD_GRAD_OP \
@@ -37,7 +38,10 @@ torchrun \
     --activation_checkpointing=fine_grained \
     --fused_loss=true \
     --device_train_microbatch_size=2 \
-    --global_train_batch_size=1024
-    
+    --global_train_batch_size=1024 \
+    --save_interval=50 \
+    --eval_interval=50 \
+    --optimizer.metrics_log_interval=1
+
     #--save_overwrite \
     #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7/}'
\ No newline at end of file

From 732b397a7f112d70a1e86e3e086fd2eab3a34091 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Wed, 8 May 2024 10:27:45 -0700
Subject: [PATCH 03/49] Plan C: backtrack, cut LR in half

---
 configs/mcli/mitchish70.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index db7b637ab..99bcb9cdb 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -62,11 +62,12 @@ command: |-
   --node_rank "$NODE_RANK" \
   --nproc_per_node 8 \
   scripts/train.py configs/mitchish70-s3.yaml \
-    --run_name=mitchish70-planc \
-    --wandb.group=mitchish70-planc \
-    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+    --run_name=mitchish70-pland \
+    '--wandb.group=${run_name}' \
+    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \
     --load_path_sharded_checkpointer=olmo_core \
     --sharded_checkpointer=olmo_core \
+    --optimizer.learning_rate=0.000075 \
     --global_train_batch_size=3584 \
     --device_train_microbatch_size=4 \
     --fsdp.sharding_strategy=HYBRID_SHARD \

From 59034d658f9036452ce07a845bf1adf9ea168d6e Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Wed, 8 May 2024 10:48:44 -0700
Subject: [PATCH 04/49] fix branch

---
 configs/mcli/mitchish70.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index 99bcb9cdb..f83aa4f08 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -15,7 +15,7 @@ compute:
 integrations:
   - integration_type: git_repo
     git_repo: allenai/OLMo
-    git_branch: epwalsh/train-olmo-large
+    git_branch: train-olmo-large
     pip_install: -e .[train]
     ssh_clone: true
   - integration_type: git_repo

From 7015172529ae34481db36947d713ab1c2bbc47fe Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Wed, 8 May 2024 11:16:40 -0700
Subject: [PATCH 05/49] pile llamaish

---
 configs/pile-llamaish-s3.yaml           | 528 ++++++++++++++++++++++++
 scripts/beaker/pile-llamaish7-launch.sh |  32 ++
 scripts/beaker/pile-llamaish7.sh        |  47 +++
 3 files changed, 607 insertions(+)
 create mode 100644 configs/pile-llamaish-s3.yaml
 create mode 100755 scripts/beaker/pile-llamaish7-launch.sh
 create mode 100755 scripts/beaker/pile-llamaish7.sh

diff --git a/configs/pile-llamaish-s3.yaml b/configs/pile-llamaish-s3.yaml
new file mode 100644
index 000000000..bf96bd5c4
--- /dev/null
+++ b/configs/pile-llamaish-s3.yaml
@@ -0,0 +1,528 @@
+run_name: pile-llamaish7-001
+seed: 6198
+dry_run: false
+
+wandb:
+  name: ${run_name}
+  project: olmo-medium
+  group: pile-llamaish7
+
+model:
+  d_model: 4096
+  n_heads: 32
+  n_layers: 32
+  # mlp_ratio: 6
+  mlp_hidden_size: 22016
+  weight_tying: false
+  alibi: false
+  rope: true
+  flash_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  clip_qkv: 8.0
+  include_bias: false
+  block_type: sequential
+  layer_norm_type: default
+  layer_norm_with_affine: false
+  bias_for_layer_norm: false
+  attention_layer_norm_with_affine: false
+  activation_type: swiglu
+  residual_dropout: 0.0
+  embedding_dropout: 0.0
+  max_sequence_length: 4096
+  vocab_size: 50280
+  embedding_size: 50304
+  eos_token_id: 0
+  pad_token_id: 1
+  init_device: meta
+  init_fn: full_megatron
+  init_std: 0.006
+  init_cutoff_factor: 3
+
+compile: null
+
+optimizer:
+  name: adamw
+  learning_rate: 3.0e-4
+  weight_decay: 0.1
+  decay_norm_and_bias: true
+  decay_embeddings: false
+  betas:
+  - 0.9
+  - 0.95
+  metrics_log_interval: 10
+
+scheduler:
+  name: cosine_with_warmup
+  units: tokens
+  t_warmup: 20971520000
+  t_max: 3e12
+  alpha_f: 0.1
+  grad_clip_warmup_steps: 2097152000
+  grad_clip_warmup_factor: 5
+
+tokenizer:
+  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
+  truncate_direction: right
+
+save_folder: runs/${run_name}
+remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name}
+save_overwrite: false
+
+save_interval: 1000
+save_interval_ephemeral: null
+save_num_checkpoints_to_keep: -1
+sharded_checkpointer: olmo_core
+
+save_interval_unsharded: null
+save_num_unsharded_checkpoints_to_keep: -1
+
+load_path: null
+
+max_duration: 2ep
+global_train_batch_size: 512
+device_train_microbatch_size: 2
+
+precision: amp_bf16
+
+fsdp:
+  wrapping_strategy: by_block_and_size
+  precision: mixed
+
+max_grad_norm: 1.0
+max_grad_norm_ratio: null
+
+speed_monitor:
+  window_size: 1
+
+eval_interval: 1000
+eval_subset_num_batches: -1
+device_eval_batch_size: ${device_train_microbatch_size}
+evaluators:
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      datasets:
+        c4_en-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy
+        dolma_books-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy
+        dolma_common-crawl-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy
+        dolma_pes2o-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy
+        dolma_reddit-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy
+        dolma_stack-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy
+        dolma_wiki-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy
+        ice-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy
+        m2d2_s2orc-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy
+        pile-validation:
+        - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy
+        wikitext_103-validation:
+          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy
+
+  ##########################
+  # Downstream evaluations #
+  ##########################
+  - label: piqa
+    type: downstream
+
+  - label: hellaswag
+    type: downstream
+
+  - label: winogrande
+    type: downstream
+
+  - label: openbook_qa
+    type: downstream
+
+  - label: boolq
+    type: downstream
+ 
+  - label: sciq
+    type: downstream
+
+  - label: arc_easy
+    type: downstream
+
+  - label: arc_challenge
+    type: downstream
+
+  - label: copa
+    type: downstream
+
+  #- label: rte
+  #  type: downstream
+
+  #- label: commitment_bank
+  #  type: downstream
+
+  #- label: sst2
+  #  type: downstream
+
+  - label: commonsense_qa
+    type: downstream
+
+  - label: social_iqa
+    type: downstream
+
+  # Doesn't work from cache.
+  # - label: basic_arithmetic
+  #   type: downstream
+
+  - label: mmlu_stem_var
+    type: downstream
+
+  - label: mmlu_humanities_var
+    type: downstream
+
+  - label: mmlu_social_sciences_var
+    type: downstream
+
+  - label: mmlu_other_var
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_other_mc_5shot_test
+    type: downstream
+
+data:
+  pad_direction: right
+  num_workers: 32
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: 8
+  persistent_workers: true
+  timeout: 0
+  instance_filter:
+    repetition_max_period: 13
+    repetition_min_period: 1
+    repetition_max_count: 32
+  paths:
+    ######### PILE #########
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-000-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-000-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-001-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-001-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-002-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-002-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-003-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-003-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-004-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-004-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-005-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-005-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-006-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-006-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-007-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-007-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-008-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-008-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-009-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-009-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-010-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-010-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-011-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-011-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-012-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-012-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-013-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-013-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-014-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-014-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-015-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-015-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-016-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-016-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-017-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-017-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-018-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-018-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-019-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-019-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-020-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-020-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-021-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-021-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-022-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-022-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-023-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-023-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-024-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-024-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-025-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-025-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-026-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-026-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-027-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-027-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-028-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-028-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-029-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-029-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-030-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-030-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-031-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-031-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-032-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-032-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-033-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-033-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-034-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-034-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-035-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-035-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-036-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-036-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-037-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-037-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-038-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-038-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-039-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-039-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-040-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-040-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-041-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-041-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-042-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-042-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-043-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-043-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-044-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-044-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-045-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-045-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-046-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-046-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-047-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-047-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-048-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-048-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-049-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-049-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-050-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-050-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-051-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-051-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-052-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-052-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-053-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-053-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-054-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-054-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-055-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-055-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-056-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-056-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-057-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-057-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-058-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-058-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-059-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-059-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-060-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-060-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-061-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-061-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-062-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-062-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-063-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-063-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-064-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-064-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-065-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-065-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-066-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-066-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-067-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-067-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-068-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-068-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-069-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-069-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-070-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-070-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-071-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-071-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-072-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-072-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-073-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-073-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-074-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-074-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-075-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-075-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-076-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-076-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-077-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-077-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-078-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-078-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-079-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-079-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-080-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-080-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-081-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-081-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-082-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-082-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-083-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-083-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-084-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-084-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-085-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-085-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-086-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-086-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-087-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-087-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-088-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-088-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-089-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-089-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-090-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-090-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-091-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-091-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-092-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-092-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-093-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-093-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-094-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-094-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-095-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-095-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-096-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-096-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-097-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-097-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-098-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-098-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-099-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-099-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-100-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-100-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-101-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-101-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-102-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-102-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-103-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-103-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-104-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-104-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-105-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-105-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-106-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-106-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-107-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-107-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-108-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-108-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-109-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-109-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-110-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-110-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-111-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-111-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-112-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-112-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-113-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-113-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-114-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-114-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-115-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-115-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-116-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-116-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-117-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-117-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-118-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-118-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-119-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-119-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-120-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-120-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-121-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-121-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-122-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-122-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-123-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-123-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-124-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-124-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-125-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-125-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-126-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-126-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-127-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-127-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-128-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-128-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-129-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-129-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-130-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-130-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-131-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-131-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-132-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-132-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-133-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-133-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-134-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-134-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-135-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-135-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-136-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-136-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-137-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-137-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-138-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-138-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-139-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-139-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-140-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-140-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-141-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-141-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-142-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-142-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-143-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-143-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-144-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-144-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-145-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-145-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-146-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-146-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-147-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-147-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-148-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-148-00001.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-149-00000.npy
+    - s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special/part-149-00001.npy
diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh
new file mode 100755
index 000000000..1a323f453
--- /dev/null
+++ b/scripts/beaker/pile-llamaish7-launch.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Launch the pile-llamaish7 task on Beaker via `gantry run`; each replica runs scripts/beaker/pile-llamaish7.sh.
+set -ex
+
+NUM_NODES=8  # replica count; also forwarded to the per-node script as its second argument
+
+gantry run \
+  --workspace ai2/akshitab \
+  --task-name pile-llamaish7 \
+  --description "OLMo medium - 7B - Llamaish - Pile" \
+  --priority high \
+  --beaker-image shanea/olmo-torch2.2-gantry \
+  --cluster ai2/pluto-cirrascale \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --budget ai2/oe-training \
+  --no-nfs \
+  --propagate-failure \
+  --synchronized-start-timeout 10m \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/pile-llamaish7.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
diff --git a/scripts/beaker/pile-llamaish7.sh b/scripts/beaker/pile-llamaish7.sh
new file mode 100755
index 000000000..46d5b26a2
--- /dev/null
+++ b/scripts/beaker/pile-llamaish7.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
+# Usage: pile-llamaish7.sh <leader-hostname> <num-nodes> <replica-rank>
+BEAKER_LEADER_REPLICA_HOSTNAME=$1  # host part of torchrun's --rdzv_endpoint (port 29400)
+shift
+
+NUM_NODES=$1  # given to torchrun as --nnodes N:N
+shift
+
+BEAKER_REPLICA_RANK=$1  # given to torchrun as --node_rank
+shift
+
+# Warm HF cache: stream a cache tarball into /root/.cache; --keep-newer-files leaves newer existing files untouched
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1  # presumably makes HF datasets rely on the cache unpacked above -- TODO confirm
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=12347 \
+  --rdzv_backend=static \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  --node_rank=$BEAKER_REPLICA_RANK \
+  --rdzv_conf="read_timeout=420" \
+  scripts/train.py \
+  configs/pile-llamaish7-s3.yaml \
+    --run_name=pile-llamaish7 \
+    --wandb.name=pile-llamaish7 \
+    --wandb.group=pile-llamaish7 \
+    --model.flash_attention=true \
+    --fsdp.wrapping_strategy=by_block_and_size \
+    --fsdp.sharding_strategy=SHARD_GRAD_OP \
+    --save_folder=runs/ \
+    --activation_checkpointing=fine_grained \
+    --fused_loss=true \
+    --device_train_microbatch_size=2 \
+    --global_train_batch_size=1024 \
+    --save_interval=50 \
+    --eval_interval=50 \
+    --optimizer.metrics_log_interval=1
+
+    #--save_overwrite \
+    #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7/}'
\ No newline at end of file

From 26ba8646b167dd9728c3c998ce5e5a14a905eabb Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Wed, 8 May 2024 11:19:12 -0700
Subject: [PATCH 06/49] try on jupiter

---
 scripts/beaker/pile-llamaish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh
index 1a323f453..f2c628c2e 100755
--- a/scripts/beaker/pile-llamaish7-launch.sh
+++ b/scripts/beaker/pile-llamaish7-launch.sh
@@ -10,7 +10,7 @@ gantry run \
   --description "OLMo medium - 7B - Llamaish - Pile" \
   --priority high \
   --beaker-image shanea/olmo-torch2.2-gantry \
-  --cluster ai2/pluto-cirrascale \
+  --cluster ai2/jupiter-cirrascale \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \

From 453fed1d93ad3def55f448803ad15a0d2b3bf588 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 8 May 2024 11:19:51 -0700
Subject: [PATCH 07/49] Resume the detailed llamaish run

---
 scripts/beaker/llamaish7.sh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scripts/beaker/llamaish7.sh b/scripts/beaker/llamaish7.sh
index 6115fe71b..8bdd69a37 100755
--- a/scripts/beaker/llamaish7.sh
+++ b/scripts/beaker/llamaish7.sh
@@ -41,7 +41,6 @@ torchrun \
     --global_train_batch_size=1024 \
     --save_interval=50 \
     --eval_interval=50 \
-    --optimizer.metrics_log_interval=1
-
-    #--save_overwrite \
-    #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7/}'
\ No newline at end of file
+    --optimizer.metrics_log_interval=1 \
+    --save_overwrite \
+    '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7-detailed/}'

From 3ce4ec9955c9effc4ad3836ece18205173c2a30e Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Wed, 8 May 2024 11:33:30 -0700
Subject: [PATCH 08/49] fix filename

---
 configs/{pile-llamaish-s3.yaml => pile-llamaish7-s3.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename configs/{pile-llamaish-s3.yaml => pile-llamaish7-s3.yaml} (100%)

diff --git a/configs/pile-llamaish-s3.yaml b/configs/pile-llamaish7-s3.yaml
similarity index 100%
rename from configs/pile-llamaish-s3.yaml
rename to configs/pile-llamaish7-s3.yaml

From 05b8f39390fa52e985a0adb454ba64110269fe35 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Wed, 8 May 2024 11:42:31 -0700
Subject: [PATCH 09/49] run on pluto

---
 scripts/beaker/pile-llamaish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh
index f2c628c2e..1a323f453 100755
--- a/scripts/beaker/pile-llamaish7-launch.sh
+++ b/scripts/beaker/pile-llamaish7-launch.sh
@@ -10,7 +10,7 @@ gantry run \
   --description "OLMo medium - 7B - Llamaish - Pile" \
   --priority high \
   --beaker-image shanea/olmo-torch2.2-gantry \
-  --cluster ai2/jupiter-cirrascale \
+  --cluster ai2/pluto-cirrascale \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \

From 391a53726b54e3c48493021f66bb1f4b7690ddb6 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Wed, 8 May 2024 13:27:10 -0700
Subject: [PATCH 10/49] whitelist nodes

---
 configs/mcli/mitchish70.yaml | 119 +++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index f83aa4f08..63b1a58ed 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -12,6 +12,125 @@ compute:
   gpu_type: h100_80gb
   instance: oci.bm.gpu.h100.8
   # node_names:
+  #   - inst-jeel7-r15z3-workers
+  #   - inst-ubbqk-r15z3-workers
+  #   - inst-bsgg4-r15z3-workers
+  #   - inst-ivjqi-r15z3-workers
+  #   - inst-2oyig-r15z3-workers
+  #   - inst-5irk5-r15z3-workers
+  #   - inst-jgvhh-r15z3-workers
+  #   - inst-edsue-r15z3-workers
+  #   - inst-nv70l-r15z3-workers
+  #   - inst-yg289-r15z3-workers
+  #   - inst-daiox-r15z3-workers
+  #   - inst-tg5bs-r15z3-workers
+  #   - inst-lwagu-r15z3-workers
+  #   - inst-ed8jl-r15z3-workers
+  #   - inst-vwwku-r15z3-workers
+  #   - inst-go2bm-r15z3-workers
+  #   - inst-rdvlq-r15z3-workers
+  #   - inst-csom5-r15z3-workers
+  #   - inst-ht0xx-r15z3-workers
+  #   - inst-ij1rg-r15z3-workers
+  #   - inst-xoiov-r15z3-workers
+  #   - inst-fdyxp-r15z3-workers
+  #   - inst-0mf4w-r15z3-workers
+  #   - inst-awtjo-r15z3-workers
+  #   - inst-xh87c-r15z3-workers
+  #   - inst-bn5zq-r15z3-workers
+  #   - inst-ekaiy-r15z3-workers
+  #   - inst-o186f-r15z3-workers
+  #   - inst-pzgox-r15z3-workers
+  #   - inst-xtbwa-r15z3-workers
+  #   - inst-rpmhf-r15z3-workers
+  #   - inst-mrxmj-r15z3-workers
+  #   - inst-kx7fu-r15z3-workers
+  #   - inst-1nnph-r15z3-workers
+  #   - inst-cm3ec-r15z3-workers
+  #   - inst-pbivr-r15z3-workers
+  #   - inst-kdmu6-r15z3-workers
+  #   - inst-pfzsm-r15z3-workers
+  #   - inst-6tz4b-r15z3-workers
+  #   - inst-4ki3x-r15z3-workers
+  #   - inst-vy0zb-r15z3-workers
+  #   - inst-aixwt-r15z3-workers
+  #   - inst-dhjn2-r15z3-workers
+  #   - inst-vvd97-r15z3-workers
+  #   - inst-jhhcv-r15z3-workers
+  #   - inst-j3mfc-r15z3-workers
+  #   - inst-grtmk-r15z3-workers
+  #   - inst-2iaxk-r15z3-workers
+  #   - inst-drkao-r15z3-workers
+  #   - inst-cupyv-r15z3-workers
+  #   - inst-rnyqr-r15z3-workers
+  #   - inst-lpz5k-r15z3-workers
+  #   - inst-jmxxa-r15z3-workers
+  #   - inst-ih7jm-r15z3-workers
+  #   - inst-bluc6-r15z3-workers
+  #   - inst-di0ri-r15z3-workers
+  #   - inst-f0kqy-r15z3-workers
+  #   - inst-8jhc4-r15z3-workers
+  #   - inst-kxpsv-r15z3-workers
+  #   - inst-ll38i-r15z3-workers
+  #   - inst-v87vf-r15z3-workers
+  #   - inst-vv7fg-r15z3-workers
+  #   - inst-ijtgf-r15z3-workers
+  #   - inst-p5r5p-r15z3-workers
+  #   - inst-jhqyu-r15z3-workers
+  #   - inst-kdqg8-r15z3-workers
+  #   - inst-dpvjh-r15z3-workers
+  #   - inst-21fqf-r15z3-workers
+  #   - inst-qc1pa-r15z3-workers
+  #   - inst-vaqst-r15z3-workers
+  #   - inst-wrucg-r15z3-workers
+  #   - inst-5wqam-r15z3-workers
+  #   - inst-likvg-r15z3-workers
+  #   - inst-vzhyo-r15z3-workers
+  #   - inst-w4gwj-r15z3-workers
+  #   - inst-irzic-r15z3-workers
+  #   - inst-glcak-r15z3-workers
+  #   - inst-i6mnk-r15z3-workers
+  #   - inst-vjsri-r15z3-workers
+  #   - inst-rymxc-r15z3-workers
+  #   - inst-xalw1-r15z3-workers
+  #   - inst-hvw6t-r15z3-workers
+  #   - inst-xmxc2-r15z3-workers
+  #   - inst-6yvq9-r15z3-workers
+  #   - inst-r01sx-r15z3-workers
+  #   - inst-entnk-r15z3-workers
+  #   - inst-olazl-r15z3-workers
+  #   - inst-c6t2k-r15z3-workers
+  #   - inst-fatfc-r15z3-workers
+  #   - inst-evbig-r15z3-workers
+  #   - inst-e1ijl-r15z3-workers
+  #   - inst-tcttd-r15z3-workers
+  #   - inst-8z7hr-r15z3-workers
+  #   - inst-lduqx-r15z3-workers
+  #   - inst-xdqqd-r15z3-workers
+  #   - inst-znfjw-r15z3-workers
+  #   - inst-bv9yy-r15z3-workers
+  #   - inst-pyzpn-r15z3-workers
+  #   - inst-xej4c-r15z3-workers
+  #   - inst-i9qwf-r15z3-workers
+  #   - inst-v8mxi-r15z3-workers
+  #   - inst-hzzsd-r15z3-workers
+  #   - inst-tfi9t-r15z3-workers
+  #   - inst-bg14o-r15z3-workers
+  #   - inst-gggd1-r15z3-workers
+  #   - inst-gn4hg-r15z3-workers
+  #   - inst-9hoiv-r15z3-workers
+  #   - inst-rtaii-r15z3-workers
+  #   - inst-g5ojd-r15z3-workers
+  #   - inst-i1ted-r15z3-workers
+  #   - inst-o3fxl-r15z3-workers
+  #   - inst-vwnx8-r15z3-workers
+  #   # - inst-aj1o1-r15z3-workers
+  #   # - inst-tw9i6-r15z3-workers
+  #   # - inst-tturo-r15z3-workers
+  #   # - inst-97xv1-r15z3-workers
+  #   # - inst-kc1z1-r15z3-workers
+  #   # Bad nodes
+  #   # - inst-v2vx0-r15z3-workers
 integrations:
   - integration_type: git_repo
     git_repo: allenai/OLMo

From c541af1f8b563d76009b6d72c9d727ad89050dd1 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Wed, 8 May 2024 14:12:57 -0700
Subject: [PATCH 11/49] lumi config

---
 configs/pile-llamaish7.yaml | 227 ++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+)
 create mode 100644 configs/pile-llamaish7.yaml

diff --git a/configs/pile-llamaish7.yaml b/configs/pile-llamaish7.yaml
new file mode 100644
index 000000000..b7e1cdcc4
--- /dev/null
+++ b/configs/pile-llamaish7.yaml
@@ -0,0 +1,227 @@
+run_name: pile-llamaish7-001
+seed: 6198
+dry_run: false
+
+wandb:
+  name: ${run_name}
+  project: olmo-medium
+  group: pile-llamaish7
+
+model:
+  d_model: 4096
+  n_heads: 32
+  n_layers: 32
+  # mlp_ratio: 6
+  mlp_hidden_size: 22016
+  weight_tying: false
+  alibi: false
+  rope: true
+  flash_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  clip_qkv: 8.0
+  include_bias: false
+  block_type: sequential
+  layer_norm_type: default
+  layer_norm_with_affine: false
+  bias_for_layer_norm: false
+  attention_layer_norm_with_affine: false
+  activation_type: swiglu
+  residual_dropout: 0.0
+  embedding_dropout: 0.0
+  max_sequence_length: 4096
+  vocab_size: 50280
+  embedding_size: 50304
+  eos_token_id: 0
+  pad_token_id: 1
+  init_device: meta
+  init_fn: full_megatron
+  init_std: 0.006
+  init_cutoff_factor: 3
+
+compile: null
+
+optimizer:
+  name: adamw
+  learning_rate: 3.0e-4
+  weight_decay: 0.1
+  decay_norm_and_bias: true
+  decay_embeddings: false
+  betas:
+  - 0.9
+  - 0.95
+  metrics_log_interval: 10
+
+scheduler:
+  name: cosine_with_warmup
+  units: tokens
+  t_warmup: 20971520000
+  t_max: 3e12
+  alpha_f: 0.1
+  grad_clip_warmup_steps: 2097152000
+  grad_clip_warmup_factor: 5
+
+tokenizer:
+  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
+  truncate_direction: right
+
+save_folder: runs/${run_name}
+remote_save_folder: s3://ai2-llm/checkpoints/OLMo-medium/${run_name}
+save_overwrite: false
+
+save_interval: 1000
+save_interval_ephemeral: null
+save_num_checkpoints_to_keep: -1
+sharded_checkpointer: olmo_core
+
+save_interval_unsharded: null
+save_num_unsharded_checkpoints_to_keep: -1
+
+load_path: null
+
+max_duration: 2ep
+global_train_batch_size: 512
+device_train_microbatch_size: 2
+
+precision: amp_bf16
+
+fsdp:
+  wrapping_strategy: by_block_and_size
+  precision: mixed
+
+max_grad_norm: 1.0
+max_grad_norm_ratio: null
+
+speed_monitor:
+  window_size: 1
+
+eval_interval: 1000
+eval_subset_num_batches: -1
+device_eval_batch_size: ${device_train_microbatch_size}
+evaluators:
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      datasets:
+        c4_en-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy
+        dolma_books-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy
+        dolma_common-crawl-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy
+        dolma_pes2o-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy
+        dolma_reddit-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy
+        dolma_stack-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy
+        dolma_wiki-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy
+        ice-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy
+        m2d2_s2orc-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy
+        pile-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy
+        wikitext_103-validation:
+          - ${oc.env:EVAL_DATA_PATH}/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy
+
+  ##########################
+  # Downstream evaluations #
+  ##########################
+  - label: piqa
+    type: downstream
+
+  - label: hellaswag
+    type: downstream
+
+  - label: winogrande
+    type: downstream
+
+  - label: openbook_qa
+    type: downstream
+
+  - label: boolq
+    type: downstream
+ 
+  - label: sciq
+    type: downstream
+
+  - label: arc_easy
+    type: downstream
+
+  - label: arc_challenge
+    type: downstream
+
+  - label: copa
+    type: downstream
+
+  #- label: rte
+  #  type: downstream
+
+  #- label: commitment_bank
+  #  type: downstream
+
+  #- label: sst2
+  #  type: downstream
+
+  - label: commonsense_qa
+    type: downstream
+
+  - label: social_iqa
+    type: downstream
+
+  # Doesn't work from cache.
+  # - label: basic_arithmetic
+  #   type: downstream
+
+  - label: mmlu_stem_var
+    type: downstream
+
+  - label: mmlu_humanities_var
+    type: downstream
+
+  - label: mmlu_social_sciences_var
+    type: downstream
+
+  - label: mmlu_other_var
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot
+    type: downstream
+
+  - label: mmlu_other_mc_5shot
+    type: downstream
+
+  - label: mmlu_stem_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_humanities_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_social_sciences_mc_5shot_test
+    type: downstream
+
+  - label: mmlu_other_mc_5shot_test
+    type: downstream
+
+data:
+  instance_filter:
+    repetition_max_period: 13
+    repetition_min_period: 1
+    repetition_max_count: 32
+  paths: ${path.glob:${oc.env:DATA_PATH}/pile/gpt-neox-20b-pii-special/*.npy}
+  pad_direction: right
+  num_workers: 0
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: 16
+  persistent_workers: true
+  timeout: 0
\ No newline at end of file

From 4095258f19ff4100eed9ac822c7288c47edbb808 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Thu, 9 May 2024 02:40:14 +0300
Subject: [PATCH 12/49] lumi script

---
 scripts/lumi/pile-llamaish7.sh | 65 ++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 scripts/lumi/pile-llamaish7.sh

diff --git a/scripts/lumi/pile-llamaish7.sh b/scripts/lumi/pile-llamaish7.sh
new file mode 100644
index 000000000..c032ddbe3
--- /dev/null
+++ b/scripts/lumi/pile-llamaish7.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#SBATCH --job-name=pile-llamaish7
+#SBATCH --account=project_462000229
+#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
+#SBATCH --nodes=8             # Total number of nodes 
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8       # Allocate one gpu per MPI rank
+#SBATCH --cpus-per-task=6
+#SBATCH --time=39:15:00
+#SBATCH --mem=0			# All memory on the node
+#SBATCH --partition=standard-g
+
+WANDB_GROUP=$1
+shift
+
+export OLMO_CONTAINER=llm-lumi-torch21_latest.sif
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export MPICH_GPU_SUPPORT_ENABLED=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET_GDR_LEVEL=3
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export CXI_FORK_SAFE=1
+export CXI_FORK_SAFE_HP=1
+export FI_CXI_DISABLE_CQ_HUGETLB=1
+
+# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+
+#export NCCL_DEBUG=INFO
+export PYTHONPATH=.:${PYTHONPATH}
+export ROCM_PATH=/opt/rocm
+export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64
+
+# Try playing with max_split_size_mb if you run into OOM errors.
+#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
+
+export HF_DATASETS_OFFLINE=1
+
+export DATA_PATH=$FLASH_DIR/preprocessed/pile
+export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
+export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data
+
+srun \
+  --cpus-per-task=$SLURM_CPUS_PER_TASK \
+  --distribution=block:block \
+  --kill-on-bad-exit \
+  scripts/run_with_environment.sh \
+    singularity exec \
+    -B"$PROJECT_DIR:$PROJECT_DIR" \
+    -B"$FLASH_DIR:$FLASH_DIR" \
+    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
+    -B /opt/cray:/opt/cray \
+    -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
+    -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
+    "$PROJECT_DIR/containers/$OLMO_CONTAINER" \
+    python scripts/train.py configs/pile-llamaish7.yaml "$@" \
+      --run_name=${SLURM_JOB_ID} \
+      --activation_checkpointing=fine_grained \
+      --fsdp.wrapping_strategy=one_in_four \
+      --fsdp.sharding_strategy=FULL_SHARD \
+      --sharded_checkpointer=local \
+      --time_limit=$((39 * 60 * 60)) \
+      --wandb.group="$WANDB_GROUP"

From 92496f46982eafb2bdba8c3e74ec76cfe4c7cb7f Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Thu, 9 May 2024 08:41:27 -0700
Subject: [PATCH 13/49] jupiter

---
 scripts/beaker/pile-llamaish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh
index 1a323f453..f2c628c2e 100755
--- a/scripts/beaker/pile-llamaish7-launch.sh
+++ b/scripts/beaker/pile-llamaish7-launch.sh
@@ -10,7 +10,7 @@ gantry run \
   --description "OLMo medium - 7B - Llamaish - Pile" \
   --priority high \
   --beaker-image shanea/olmo-torch2.2-gantry \
-  --cluster ai2/pluto-cirrascale \
+  --cluster ai2/jupiter-cirrascale \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \

From c1816db7a9260ec1b656238cf5e1f52d7067fed5 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Thu, 9 May 2024 09:26:08 -0700
Subject: [PATCH 14/49] increase sync timeout

---
 scripts/beaker/pile-llamaish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/pile-llamaish7-launch.sh b/scripts/beaker/pile-llamaish7-launch.sh
index f2c628c2e..129ffb0e1 100755
--- a/scripts/beaker/pile-llamaish7-launch.sh
+++ b/scripts/beaker/pile-llamaish7-launch.sh
@@ -18,7 +18,7 @@ gantry run \
   --budget ai2/oe-training \
   --no-nfs \
   --propagate-failure \
-  --synchronized-start-timeout 10m \
+  --synchronized-start-timeout 20m \
   --env LOG_FILTER_TYPE=local_rank0_only \
   --env OMP_NUM_THREADS=8 \
   --env OLMO_TASK=model \

From d37125aa7126260e0a0e0a2eee84e1779095bd74 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Thu, 9 May 2024 11:31:52 -0700
Subject: [PATCH 15/49] load from path

---
 scripts/beaker/pile-llamaish7.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/pile-llamaish7.sh b/scripts/beaker/pile-llamaish7.sh
index 46d5b26a2..e87feff86 100755
--- a/scripts/beaker/pile-llamaish7.sh
+++ b/scripts/beaker/pile-llamaish7.sh
@@ -44,4 +44,4 @@ torchrun \
     --optimizer.metrics_log_interval=1
 
     #--save_overwrite \
-    #'--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/llamaish7/}'
\ No newline at end of file
+    '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/pile-llamaish7/}'

From 45e407bb71fbdae62b22f5ffd5b7b2e978be220c Mon Sep 17 00:00:00 2001
From: Akshita Bhagia <akshita23bhagia@gmail.com>
Date: Thu, 9 May 2024 11:40:57 -0700
Subject: [PATCH 16/49] fix

---
 scripts/beaker/pile-llamaish7.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/beaker/pile-llamaish7.sh b/scripts/beaker/pile-llamaish7.sh
index e87feff86..f2e8fcf5c 100755
--- a/scripts/beaker/pile-llamaish7.sh
+++ b/scripts/beaker/pile-llamaish7.sh
@@ -41,7 +41,5 @@ torchrun \
     --global_train_batch_size=1024 \
     --save_interval=50 \
     --eval_interval=50 \
-    --optimizer.metrics_log_interval=1
-
-    #--save_overwrite \
+    --optimizer.metrics_log_interval=1 \
     '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/pile-llamaish7/}'

From 420d6dd2e97b5287bbcdbe38917a3f4c64db947e Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Fri, 10 May 2024 16:46:52 -0700
Subject: [PATCH 17/49] update config for restart

---
 configs/mcli/mitchish70.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index 63b1a58ed..7721ece1d 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -183,7 +183,7 @@ command: |-
   scripts/train.py configs/mitchish70-s3.yaml \
     --run_name=mitchish70-pland \
     '--wandb.group=${run_name}' \
-    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \
+    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
     --load_path_sharded_checkpointer=olmo_core \
     --sharded_checkpointer=olmo_core \
     --optimizer.learning_rate=0.000075 \
@@ -199,6 +199,7 @@ command: |-
 # --fsdp.hybrid_sharding_num_model_replicas=4 \
 # 
 #    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+#    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \
 #    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step48950 \
 #    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step49000 \
 #    --load_path=/root/checkpoint-unsharded \

From 05ad705fab023fa41fe1125bb888b013a095de73 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Mon, 13 May 2024 08:53:05 -0700
Subject: [PATCH 18/49] update config

---
 configs/mcli/mitchish70.yaml | 119 -----------------------------------
 1 file changed, 119 deletions(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index 7721ece1d..951990fda 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -12,125 +12,6 @@ compute:
   gpu_type: h100_80gb
   instance: oci.bm.gpu.h100.8
   # node_names:
-  #   - inst-jeel7-r15z3-workers
-  #   - inst-ubbqk-r15z3-workers
-  #   - inst-bsgg4-r15z3-workers
-  #   - inst-ivjqi-r15z3-workers
-  #   - inst-2oyig-r15z3-workers
-  #   - inst-5irk5-r15z3-workers
-  #   - inst-jgvhh-r15z3-workers
-  #   - inst-edsue-r15z3-workers
-  #   - inst-nv70l-r15z3-workers
-  #   - inst-yg289-r15z3-workers
-  #   - inst-daiox-r15z3-workers
-  #   - inst-tg5bs-r15z3-workers
-  #   - inst-lwagu-r15z3-workers
-  #   - inst-ed8jl-r15z3-workers
-  #   - inst-vwwku-r15z3-workers
-  #   - inst-go2bm-r15z3-workers
-  #   - inst-rdvlq-r15z3-workers
-  #   - inst-csom5-r15z3-workers
-  #   - inst-ht0xx-r15z3-workers
-  #   - inst-ij1rg-r15z3-workers
-  #   - inst-xoiov-r15z3-workers
-  #   - inst-fdyxp-r15z3-workers
-  #   - inst-0mf4w-r15z3-workers
-  #   - inst-awtjo-r15z3-workers
-  #   - inst-xh87c-r15z3-workers
-  #   - inst-bn5zq-r15z3-workers
-  #   - inst-ekaiy-r15z3-workers
-  #   - inst-o186f-r15z3-workers
-  #   - inst-pzgox-r15z3-workers
-  #   - inst-xtbwa-r15z3-workers
-  #   - inst-rpmhf-r15z3-workers
-  #   - inst-mrxmj-r15z3-workers
-  #   - inst-kx7fu-r15z3-workers
-  #   - inst-1nnph-r15z3-workers
-  #   - inst-cm3ec-r15z3-workers
-  #   - inst-pbivr-r15z3-workers
-  #   - inst-kdmu6-r15z3-workers
-  #   - inst-pfzsm-r15z3-workers
-  #   - inst-6tz4b-r15z3-workers
-  #   - inst-4ki3x-r15z3-workers
-  #   - inst-vy0zb-r15z3-workers
-  #   - inst-aixwt-r15z3-workers
-  #   - inst-dhjn2-r15z3-workers
-  #   - inst-vvd97-r15z3-workers
-  #   - inst-jhhcv-r15z3-workers
-  #   - inst-j3mfc-r15z3-workers
-  #   - inst-grtmk-r15z3-workers
-  #   - inst-2iaxk-r15z3-workers
-  #   - inst-drkao-r15z3-workers
-  #   - inst-cupyv-r15z3-workers
-  #   - inst-rnyqr-r15z3-workers
-  #   - inst-lpz5k-r15z3-workers
-  #   - inst-jmxxa-r15z3-workers
-  #   - inst-ih7jm-r15z3-workers
-  #   - inst-bluc6-r15z3-workers
-  #   - inst-di0ri-r15z3-workers
-  #   - inst-f0kqy-r15z3-workers
-  #   - inst-8jhc4-r15z3-workers
-  #   - inst-kxpsv-r15z3-workers
-  #   - inst-ll38i-r15z3-workers
-  #   - inst-v87vf-r15z3-workers
-  #   - inst-vv7fg-r15z3-workers
-  #   - inst-ijtgf-r15z3-workers
-  #   - inst-p5r5p-r15z3-workers
-  #   - inst-jhqyu-r15z3-workers
-  #   - inst-kdqg8-r15z3-workers
-  #   - inst-dpvjh-r15z3-workers
-  #   - inst-21fqf-r15z3-workers
-  #   - inst-qc1pa-r15z3-workers
-  #   - inst-vaqst-r15z3-workers
-  #   - inst-wrucg-r15z3-workers
-  #   - inst-5wqam-r15z3-workers
-  #   - inst-likvg-r15z3-workers
-  #   - inst-vzhyo-r15z3-workers
-  #   - inst-w4gwj-r15z3-workers
-  #   - inst-irzic-r15z3-workers
-  #   - inst-glcak-r15z3-workers
-  #   - inst-i6mnk-r15z3-workers
-  #   - inst-vjsri-r15z3-workers
-  #   - inst-rymxc-r15z3-workers
-  #   - inst-xalw1-r15z3-workers
-  #   - inst-hvw6t-r15z3-workers
-  #   - inst-xmxc2-r15z3-workers
-  #   - inst-6yvq9-r15z3-workers
-  #   - inst-r01sx-r15z3-workers
-  #   - inst-entnk-r15z3-workers
-  #   - inst-olazl-r15z3-workers
-  #   - inst-c6t2k-r15z3-workers
-  #   - inst-fatfc-r15z3-workers
-  #   - inst-evbig-r15z3-workers
-  #   - inst-e1ijl-r15z3-workers
-  #   - inst-tcttd-r15z3-workers
-  #   - inst-8z7hr-r15z3-workers
-  #   - inst-lduqx-r15z3-workers
-  #   - inst-xdqqd-r15z3-workers
-  #   - inst-znfjw-r15z3-workers
-  #   - inst-bv9yy-r15z3-workers
-  #   - inst-pyzpn-r15z3-workers
-  #   - inst-xej4c-r15z3-workers
-  #   - inst-i9qwf-r15z3-workers
-  #   - inst-v8mxi-r15z3-workers
-  #   - inst-hzzsd-r15z3-workers
-  #   - inst-tfi9t-r15z3-workers
-  #   - inst-bg14o-r15z3-workers
-  #   - inst-gggd1-r15z3-workers
-  #   - inst-gn4hg-r15z3-workers
-  #   - inst-9hoiv-r15z3-workers
-  #   - inst-rtaii-r15z3-workers
-  #   - inst-g5ojd-r15z3-workers
-  #   - inst-i1ted-r15z3-workers
-  #   - inst-o3fxl-r15z3-workers
-  #   - inst-vwnx8-r15z3-workers
-  #   # - inst-aj1o1-r15z3-workers
-  #   # - inst-tw9i6-r15z3-workers
-  #   # - inst-tturo-r15z3-workers
-  #   # - inst-97xv1-r15z3-workers
-  #   # - inst-kc1z1-r15z3-workers
-  #   # Bad nodes
-  #   # - inst-v2vx0-r15z3-workers
 integrations:
   - integration_type: git_repo
     git_repo: allenai/OLMo

From 06fc94f44426f01b2408db98b8c39912dd313278 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Tue, 23 Apr 2024 08:44:04 -0700
Subject: [PATCH 19/49] Try stop wandb from flaking at cancel

---
 olmo/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/olmo/train.py b/olmo/train.py
index b2f0fcb28..26ec3a908 100644
--- a/olmo/train.py
+++ b/olmo/train.py
@@ -925,6 +925,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
                 # Finally, check if someone canceled the run from W&B by adding the 'cancel' / 'canceled' tag..
                 # We won't see it in the run object. So we have to use the import/export API to check.
                 from requests.exceptions import RequestException
+                from wandb.errors import CommError
 
                 try:
                     api = wandb.Api(api_key=api_key)
@@ -935,7 +936,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
                             cancel_reason = "Weights & Biases tag"
                             extra_steps = self.cfg.extra_steps_after_cancel
                             break
-                except RequestException:
+                except (RequestException, CommError):
                     pass
 
         run_canceled = synchronize_flag(should_cancel, self.device)

From 1e603a80559ee54930cfbe8c14505a27aa576fc3 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 15 May 2024 23:36:22 -0700
Subject: [PATCH 20/49] Configure mitchish7 to continue (with the datafix)

---
 scripts/beaker/mitchish7.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/beaker/mitchish7.sh b/scripts/beaker/mitchish7.sh
index 34aa66898..d91c6eaac 100755
--- a/scripts/beaker/mitchish7.sh
+++ b/scripts/beaker/mitchish7.sh
@@ -40,5 +40,5 @@ torchrun \
     --global_train_batch_size=1024 \
     --gen1_gc_interval=32 \
     --save_overwrite \
-    '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/}' \
-    --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step614000/
\ No newline at end of file
+    '--load_path=${path.last_checkpoint:s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/}'
+    # --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step614000/
\ No newline at end of file

From 52cfe84b9f06d2c53e0541a5daece3372d7b7a15 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 15 May 2024 23:37:43 -0700
Subject: [PATCH 21/49] I am normal.

---
 scripts/beaker/mitchish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/mitchish7-launch.sh b/scripts/beaker/mitchish7-launch.sh
index e6130365b..42ace192e 100755
--- a/scripts/beaker/mitchish7-launch.sh
+++ b/scripts/beaker/mitchish7-launch.sh
@@ -8,7 +8,7 @@ gantry run \
   --workspace ai2/dirkg \
   --task-name mitchish7 \
   --description "OLMo medium - 7B" \
-  --priority high \
+  --priority normal \
   --beaker-image shanea/olmo-torch2.2-gantry \
   --cluster ai2/pluto-cirrascale \
   --gpus 8 \

From 5c8b115fdd7087d2131845b1e7ac7355ea17d119 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 15 May 2024 23:42:07 -0700
Subject: [PATCH 22/49] This is supposed to run on Jupiter.

---
 scripts/beaker/mitchish7-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/mitchish7-launch.sh b/scripts/beaker/mitchish7-launch.sh
index 42ace192e..3e41098a7 100755
--- a/scripts/beaker/mitchish7-launch.sh
+++ b/scripts/beaker/mitchish7-launch.sh
@@ -10,7 +10,7 @@ gantry run \
   --description "OLMo medium - 7B" \
   --priority normal \
   --beaker-image shanea/olmo-torch2.2-gantry \
-  --cluster ai2/pluto-cirrascale \
+  --cluster ai2/jupiter-cirrascale \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \

From 45acb65804ebef59273defeb47516e314baa666a Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Fri, 17 May 2024 17:57:08 -0700
Subject: [PATCH 23/49] Config for a restart with constant LR

---
 configs/mcli/mitchish70-from160510.yaml | 101 ++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 configs/mcli/mitchish70-from160510.yaml

diff --git a/configs/mcli/mitchish70-from160510.yaml b/configs/mcli/mitchish70-from160510.yaml
new file mode 100644
index 000000000..e0fdd1314
--- /dev/null
+++ b/configs/mcli/mitchish70-from160510.yaml
@@ -0,0 +1,101 @@
+name: olmo-70b-from160510
+image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
+# image: public.ecr.aws/z0f8p3z5/olmo:pytorch2.2.1_cu121-python3.11-ubuntu20.04
+# image: us-central1-docker.pkg.dev/ai2-olmo/olmo/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
+scheduling:
+  priority: auto
+  # preemptible: true  # means it can be retried
+  # max_retries: 10
+compute:
+  cluster: r15z4
+  gpus: 896
+  gpu_type: h100_80gb
+  instance: oci.bm.gpu.h100.8
+  # node_names:
+integrations:
+  - integration_type: git_repo
+    git_repo: allenai/OLMo
+    git_branch: train-olmo-large
+    pip_install: -e .[train]
+    ssh_clone: true
+  - integration_type: git_repo
+    git_repo: allenai/OLMo-core
+    git_branch: main
+    pip_install: -e .
+    ssh_clone: true
+env_variables:
+  PIP_DISABLE_PIP_VERSION_CHECK: "1"
+  OMP_NUM_THREADS: "8"
+  LOG_FILTER_TYPE: local_rank0_only
+command: |-
+  # Make sure we have a recent flash-attn.
+  # NOTE: only pinning flash-attn here to future proof it.
+  pip install flash-attn==2.5.3 --no-build-isolation
+  # Install AWS CLI (for pre-downloading unsharded checkpoints).
+  pip install awscli
+
+  # Show packages for debugging.
+  pip freeze
+
+  # Prepare environment.
+  mkdir -p /root/.cache/torch
+  # warm up huggingface cache
+  pushd /root/.cache
+  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar -xzf -
+  popd
+  export HF_DATASETS_OFFLINE=1
+
+  #checkpoint=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step160500-unsharded-hacked
+  #mkdir /root/checkpoint-unsharded
+  #aws s3 cp --no-progress ${checkpoint}/config.yaml /root/checkpoint-unsharded/
+  #aws s3 cp --no-progress ${checkpoint}/train.pt /root/checkpoint-unsharded/
+  #aws s3 cp --no-progress ${checkpoint}/model.safetensors /root/checkpoint-unsharded/
+  #aws s3 cp --no-progress ${checkpoint}/optim.safetensors /root/checkpoint-unsharded/
+
+  cd OLMo
+
+  echo "Launching train script..."
+  torchrun \
+  --nproc_per_node 8 \
+  --nnodes 112:112 \
+  --rdzv_id=22232 \
+  --rdzv_backend=static \
+  --rdzv_endpoint=$MASTER_ADDR:29400 \
+  --node_rank=$NODE_RANK \
+  --rdzv_conf="read_timeout=420" \
+  scripts/train.py configs/mitchish70-s3.yaml \
+    --run_name=mitchish70-from160510 \
+    '--wandb.group=${run_name}' \
+    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+    --load_path_sharded_checkpointer=olmo_core \
+    --sharded_checkpointer=olmo_core \
+    --global_train_batch_size=3584 \
+    --device_train_microbatch_size=4 \
+    --fsdp.sharding_strategy=HYBRID_SHARD \
+    --fsdp.hybrid_sharding_num_model_replicas=4 \
+    --time_limit=604800 \
+    --save_overwrite \
+    --optimizer.learning_rate=3.0e-05 \
+    --scheduler.alpha_f=1.0 \
+    --scheduler.t_warmup=0 \
+    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planb/step160510
+
+#
+# --fsdp.sharding_strategy=HYBRID_SHARD \
+# --fsdp.hybrid_sharding_num_model_replicas=4 \
+# 
+#    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+#    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-planc/step197000 \
+#    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step48950 \
+#    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step49000 \
+#    --load_path=/root/checkpoint-unsharded \
+#
+#  gpus: 256
+#    --global_train_batch_size=1536 \
+#  gpus: 384
+#    --global_train_batch_size=1536 \
+#    --device_train_microbatch_size=2 \
+#  gpus: 896
+#    --global_train_batch_size=1792 \
+#  gpus: 600  # (75 nodes)
+#    --global_train_batch_size=1800 \

From b394ac287c614006b8ee6e7086244a98583b6efc Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Sat, 18 May 2024 06:15:32 -0700
Subject: [PATCH 24/49] Fix OLMo-core

---
 configs/mcli/mitchish70.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/mcli/mitchish70.yaml b/configs/mcli/mitchish70.yaml
index 951990fda..f37bf0edf 100644
--- a/configs/mcli/mitchish70.yaml
+++ b/configs/mcli/mitchish70.yaml
@@ -20,7 +20,7 @@ integrations:
     ssh_clone: true
   - integration_type: git_repo
     git_repo: allenai/OLMo-core
-    git_branch: main
+    git_branch: WorksTorch22
     pip_install: -e .
     ssh_clone: true
 env_variables:

From 5f14392978b1a911f86891b80cf08311ebe8a745 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Sun, 19 May 2024 17:24:01 -0700
Subject: [PATCH 25/49] Need to specify nodes

---
 configs/mcli/mitchish70-from160510.yaml | 128 +++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/configs/mcli/mitchish70-from160510.yaml b/configs/mcli/mitchish70-from160510.yaml
index e0fdd1314..d2aadf037 100644
--- a/configs/mcli/mitchish70-from160510.yaml
+++ b/configs/mcli/mitchish70-from160510.yaml
@@ -11,7 +11,133 @@ compute:
   gpus: 896
   gpu_type: h100_80gb
   instance: oci.bm.gpu.h100.8
-  # node_names:
+  node_names:
+  - inst-ll38i-r15z3-workers
+  - inst-1nnph-r15z3-workers
+  - inst-edsue-r15z3-workers
+  - inst-kdmu6-r15z3-workers
+  - inst-tfi9t-r15z3-workers
+  - inst-vaqst-r15z3-workers
+  - inst-rpmhf-r15z3-workers
+  - inst-dpvjh-r15z3-workers
+  - inst-pfzsm-r15z3-workers
+  - inst-vvd97-r15z3-workers
+  - inst-entnk-r15z3-workers
+  - inst-awtjo-r15z3-workers
+  - inst-xdqqd-r15z3-workers
+  - inst-9hoiv-r15z3-workers
+  # - inst-mrkck-r15z3-workers  # bad
+  - inst-jhhcv-r15z3-workers
+  - inst-4ki3x-r15z3-workers
+  - inst-bsgg4-r15z3-workers
+  - inst-i9qwf-r15z3-workers
+  - inst-daiox-r15z3-workers
+  - inst-ijtgf-r15z3-workers
+  - inst-rymxc-r15z3-workers
+  - inst-uou7k-r15z3-workers
+  - inst-6yvq9-r15z3-workers
+  - inst-v8mxi-r15z3-workers
+  - inst-kx7fu-r15z3-workers
+  - inst-97xv1-r15z3-workers
+  - inst-vy0zb-r15z3-workers
+  - inst-csom5-r15z3-workers
+  - inst-jeel7-r15z3-workers
+  - inst-o186f-r15z3-workers
+  - inst-bluc6-r15z3-workers
+  - inst-toizy-r15z3-workers
+  - inst-vwwku-r15z3-workers
+  # - inst-ubbqk-r15z3-workers  # maybe bad
+  - inst-xalw1-r15z3-workers
+  - inst-grtmk-r15z3-workers
+  - inst-ytymh-r15z3-workers
+  - inst-e1ijl-r15z3-workers
+  - inst-vjsri-r15z3-workers
+  - inst-kc1z1-r15z3-workers
+  - inst-cm3ec-r15z3-workers
+  - inst-xtbwa-r15z3-workers
+  # - inst-lorl8-r15z3-workers  # bad
+  - inst-aixwt-r15z3-workers
+  - inst-i6mnk-r15z3-workers
+  - inst-bktpo-r15z3-workers
+  - inst-21fqf-r15z3-workers
+  - inst-ed8jl-r15z3-workers
+  - inst-5wqam-r15z3-workers
+  - inst-p1vaa-r15z3-workers
+  - inst-f0kqy-r15z3-workers
+  - inst-rnyqr-r15z3-workers
+  - inst-fdyxp-r15z3-workers
+  - inst-8jhc4-r15z3-workers
+  - inst-nv70l-r15z3-workers
+  # - inst-cupyv-r15z3-workers  # maybe bad
+  - inst-ij1rg-r15z3-workers
+  - inst-j3mfc-r15z3-workers
+  - inst-znfjw-r15z3-workers
+  - inst-5irk5-r15z3-workers
+  - inst-gn4hg-r15z3-workers
+  - inst-bn5zq-r15z3-workers
+  - inst-tw9i6-r15z3-workers
+  - inst-aj1o1-r15z3-workers
+  - inst-tturo-r15z3-workers
+  - inst-uwdwd-r15z3-workers
+  - inst-glcak-r15z3-workers
+  - inst-likvg-r15z3-workers
+  - inst-kxpsv-r15z3-workers
+  - inst-wrucg-r15z3-workers
+  - inst-xoiov-r15z3-workers
+  - inst-yg289-r15z3-workers
+  - inst-kdqg8-r15z3-workers
+  - inst-0mf4w-r15z3-workers
+  - inst-o3fxl-r15z3-workers
+  - inst-fatfc-r15z3-workers
+  - inst-lduqx-r15z3-workers
+  - inst-v87vf-r15z3-workers
+  - inst-r01sx-r15z3-workers
+  - inst-i1ted-r15z3-workers
+  - inst-vzhyo-r15z3-workers
+  - inst-evbig-r15z3-workers
+  - inst-di0ri-r15z3-workers
+  - inst-w4gwj-r15z3-workers
+  - inst-pzgox-r15z3-workers
+  - inst-2oyig-r15z3-workers
+  - inst-rdvlq-r15z3-workers
+  - inst-tcttd-r15z3-workers
+  - inst-tg5bs-r15z3-workers
+  - inst-xh87c-r15z3-workers
+  - inst-rtaii-r15z3-workers
+  - inst-go2bm-r15z3-workers
+  - inst-8z7hr-r15z3-workers
+  - inst-ekaiy-r15z3-workers
+  - inst-ht0xx-r15z3-workers
+  - inst-bg14o-r15z3-workers
+  - inst-mrxmj-r15z3-workers
+  - inst-olazl-r15z3-workers
+  - inst-eigqe-r15z3-workers
+  - inst-vwnx8-r15z3-workers
+  - inst-hzzsd-r15z3-workers
+  - inst-gggd1-r15z3-workers
+  - inst-xmxc2-r15z3-workers
+  - inst-39dwb-r15z3-workers
+  - inst-jhqyu-r15z3-workers
+  - inst-pbivr-r15z3-workers
+  - inst-jgvhh-r15z3-workers
+  - inst-vv7fg-r15z3-workers
+  - inst-lwagu-r15z3-workers
+  - inst-6tz4b-r15z3-workers
+  - inst-jmxxa-r15z3-workers
+  - inst-drkao-r15z3-workers
+  - inst-lpz5k-r15z3-workers
+  - inst-bv9yy-r15z3-workers
+  - inst-pyzpn-r15z3-workers
+  #- inst-ivjqi-r15z3-workers
+  #- inst-qc1pa-r15z3-workers
+  #- inst-hvw6t-r15z3-workers
+  #- inst-2iaxk-r15z3-workers
+  #- inst-dhjn2-r15z3-workers
+  #- inst-c6t2k-r15z3-workers
+  #- inst-ih7jm-r15z3-workers
+  #- inst-g5ojd-r15z3-workers
+  #- inst-irzic-r15z3-workers
+  #- inst-uh5f4-r15z3-workers
 integrations:
   - integration_type: git_repo
     git_repo: allenai/OLMo

From b8b4368a45e5e860ae34762fcb3fded77aa1be58 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Sun, 19 May 2024 18:57:18 -0700
Subject: [PATCH 26/49] Fix OLMo-core

---
 configs/mcli/mitchish70-from160510.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/mcli/mitchish70-from160510.yaml b/configs/mcli/mitchish70-from160510.yaml
index d2aadf037..995a5c279 100644
--- a/configs/mcli/mitchish70-from160510.yaml
+++ b/configs/mcli/mitchish70-from160510.yaml
@@ -146,7 +146,7 @@ integrations:
     ssh_clone: true
   - integration_type: git_repo
     git_repo: allenai/OLMo-core
-    git_branch: main
+    git_branch: WorksTorch22
     pip_install: -e .
     ssh_clone: true
 env_variables:

From 7a6d39dd195380b03dcc10ccecbb614b34134b71 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Sun, 19 May 2024 18:57:26 -0700
Subject: [PATCH 27/49] New nodes

---
 configs/mcli/mitchish70-from160510.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/mcli/mitchish70-from160510.yaml b/configs/mcli/mitchish70-from160510.yaml
index 995a5c279..85f853b2e 100644
--- a/configs/mcli/mitchish70-from160510.yaml
+++ b/configs/mcli/mitchish70-from160510.yaml
@@ -85,7 +85,7 @@ compute:
   - inst-wrucg-r15z3-workers
   - inst-xoiov-r15z3-workers
   - inst-yg289-r15z3-workers
-  - inst-kdqg8-r15z3-workers
+  #- inst-kdqg8-r15z3-workers
   - inst-0mf4w-r15z3-workers
   - inst-o3fxl-r15z3-workers
   - inst-fatfc-r15z3-workers
@@ -128,7 +128,7 @@ compute:
   - inst-lpz5k-r15z3-workers
   - inst-bv9yy-r15z3-workers
   - inst-pyzpn-r15z3-workers
-  #- inst-ivjqi-r15z3-workers
+  - inst-ivjqi-r15z3-workers
   #- inst-qc1pa-r15z3-workers
   #- inst-hvw6t-r15z3-workers
   #- inst-2iaxk-r15z3-workers

From a03bbf2e5945b3d5ce1d502f6c593ad7a6aaaaa7 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Mon, 20 May 2024 23:12:04 -0700
Subject: [PATCH 28/49] Continue running the 70B on Jupiter

---
 .../beaker/mitchish70-from160510-launch.sh    | 32 ++++++++++++++
 scripts/beaker/mitchish70-from160510.sh       | 42 +++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100755 scripts/beaker/mitchish70-from160510-launch.sh
 create mode 100755 scripts/beaker/mitchish70-from160510.sh

diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh
new file mode 100755
index 000000000..5414519af
--- /dev/null
+++ b/scripts/beaker/mitchish70-from160510-launch.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -ex
+
+NUM_NODES=8
+
+gantry run \
+  --workspace ai2/dirkg \
+  --task-name mitchish70-from160510 \
+  --description "OLMo large - 70B - from160510" \
+  --priority high \
+  --beaker-image shanea/olmo-torch2.2-gantry \
+  --cluster ai2/jupiter-cirrascale \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --budget ai2/oe-training \
+  --no-nfs \
+  --propagate-failure \
+  --synchronized-start-timeout 10m \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/mitchish70-from160510.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
diff --git a/scripts/beaker/mitchish70-from160510.sh b/scripts/beaker/mitchish70-from160510.sh
new file mode 100755
index 000000000..a35cfee4d
--- /dev/null
+++ b/scripts/beaker/mitchish70-from160510.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
+
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
+
+NUM_NODES=$1
+shift
+
+BEAKER_REPLICA_RANK=$1
+shift
+
+# Warm HF cache
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v3.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=52346 \
+  --rdzv_backend=static \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  --node_rank=$BEAKER_REPLICA_RANK \
+  --rdzv_conf="read_timeout=420" \
+  scripts/train.py \
+    configs/mitchish70-s3.yaml \
+      --run_name=mitchish70-from160510 \
+      '--wandb.group=${run_name}' \
+      '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+      --load_path_sharded_checkpointer=olmo_core \
+      --sharded_checkpointer=olmo_core \
+      --global_train_batch_size=3584 \
+      --device_train_microbatch_size=4 \
+      --fsdp.sharding_strategy=FULL_SHARD \
+      --save_overwrite \
+      --optimizer.learning_rate=3.0e-05 \
+      --scheduler.alpha_f=1.0 \
+      --scheduler.t_warmup=0

From 3f5f0191399122047bdf1ca7a3ac9586fa047a81 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Mon, 20 May 2024 23:14:56 -0700
Subject: [PATCH 29/49] Run in the proper workspace

---
 scripts/beaker/mitchish70-from160510-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh
index 5414519af..2dd3b62c9 100755
--- a/scripts/beaker/mitchish70-from160510-launch.sh
+++ b/scripts/beaker/mitchish70-from160510-launch.sh
@@ -5,7 +5,7 @@ set -ex
 NUM_NODES=8
 
 gantry run \
-  --workspace ai2/dirkg \
+  --workspace ai2/OLMo-training \
   --task-name mitchish70-from160510 \
   --description "OLMo large - 70B - from160510" \
   --priority high \

From 8fc56c1386483a557a2e2d82bf778ed768c66feb Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Mon, 20 May 2024 23:16:28 -0700
Subject: [PATCH 30/49] Run as me

---
 scripts/beaker/mitchish70-from160510-launch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh
index 2dd3b62c9..ebba6dd74 100755
--- a/scripts/beaker/mitchish70-from160510-launch.sh
+++ b/scripts/beaker/mitchish70-from160510-launch.sh
@@ -22,7 +22,7 @@ gantry run \
   --env LOG_FILTER_TYPE=local_rank0_only \
   --env OMP_NUM_THREADS=8 \
   --env OLMO_TASK=model \
-  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
   --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
   --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
   --shared-memory 10GiB \

From 8d74d6ce46f707df864b6356d63356a35fe9d707 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Tue, 21 May 2024 16:02:01 -0700
Subject: [PATCH 31/49] Run preemptible

---
 scripts/beaker/mitchish70-from160510-launch.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh
index ebba6dd74..77d91d78c 100755
--- a/scripts/beaker/mitchish70-from160510-launch.sh
+++ b/scripts/beaker/mitchish70-from160510-launch.sh
@@ -8,9 +8,10 @@ gantry run \
   --workspace ai2/OLMo-training \
   --task-name mitchish70-from160510 \
   --description "OLMo large - 70B - from160510" \
-  --priority high \
+  --priority normal \
   --beaker-image shanea/olmo-torch2.2-gantry \
   --cluster ai2/jupiter-cirrascale \
+  --preemptible \
   --gpus 8 \
   --replicas "${NUM_NODES}" \
   --leader-selection \

From 6f8a909134a2c66f7a664b752c6f9eea1f49f210 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Tue, 21 May 2024 16:03:15 -0700
Subject: [PATCH 32/49] Multiple clusters

---
 scripts/beaker/mitchish70-from160510-launch.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/beaker/mitchish70-from160510-launch.sh b/scripts/beaker/mitchish70-from160510-launch.sh
index 77d91d78c..7d0279ca0 100755
--- a/scripts/beaker/mitchish70-from160510-launch.sh
+++ b/scripts/beaker/mitchish70-from160510-launch.sh
@@ -11,6 +11,7 @@ gantry run \
   --priority normal \
   --beaker-image shanea/olmo-torch2.2-gantry \
   --cluster ai2/jupiter-cirrascale \
+  --cluster ai2/pluto-cirrascale \
   --preemptible \
   --gpus 8 \
   --replicas "${NUM_NODES}" \

From 96b1bcba0780f898cfceafde4035a860a154f881 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 10:03:13 -0700
Subject: [PATCH 33/49] Formatting

---
 olmo/checkpoint.py | 4 +++-
 olmo/train.py      | 9 ++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/olmo/checkpoint.py b/olmo/checkpoint.py
index f369888da..03e849b3e 100644
--- a/olmo/checkpoint.py
+++ b/olmo/checkpoint.py
@@ -55,7 +55,9 @@
     gc_cuda,
     get_fs_local_rank,
     get_global_rank,
-    get_world_size, get_local_world_size, get_local_rank,
+    get_world_size,
+    get_local_world_size,
+    get_local_rank,
 )
 from .util import (
     _get_s3_client,
diff --git a/olmo/train.py b/olmo/train.py
index 26ec3a908..893a804ab 100644
--- a/olmo/train.py
+++ b/olmo/train.py
@@ -641,7 +641,7 @@ def model_forward(
     def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Split into micro-batches.
         micro_batches = self.split_batch(batch)
-        batch_size_in_tokens = batch['input_ids'].numel()
+        batch_size_in_tokens = batch["input_ids"].numel()
 
         # In case this helps with memory utilization.
         del batch
@@ -652,9 +652,7 @@ def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[tor
             with torch.autocast("cuda", enabled=True, dtype=self.cfg.autocast_precision):
                 # Run forward pass.
                 ce_loss, z_loss, logits = self.model_forward(
-                    micro_batch,
-                    compute_z_loss=self.cfg.softmax_auxiliary_loss,
-                    loss_reduction="sum"
+                    micro_batch, compute_z_loss=self.cfg.softmax_auxiliary_loss, loss_reduction="sum"
                 )
                 ce_loss = ce_loss / batch_size_in_tokens
 
@@ -836,7 +834,8 @@ def format_float(value: float) -> str:
                 [
                     f"    {name}={format_float(value)}"
                     for name, value in metrics.items()
-                    if name == "optim/total_grad_norm" or not name.startswith("optim/")  # there's too many optimizer metrics
+                    if name == "optim/total_grad_norm"
+                    or not name.startswith("optim/")  # there's too many optimizer metrics
                 ]
             )
         )

From a7e658848cfae0c8254c7b4bf3bf489842eb7095 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 10:03:57 -0700
Subject: [PATCH 34/49] Improve the script that dry-runs the dataloader

---
 scripts/run_dataloader.py | 88 +++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 37 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index 36d87f9aa..a5b12608d 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -1,7 +1,10 @@
 import logging
-import sys
-import time
+from pathlib import Path
+from typing import Dict
+from tqdm import tqdm
 
+import safetensors.torch
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
@@ -10,24 +13,23 @@
 from olmo.data import build_memmap_dataset
 from olmo.data.collator import DataCollator
 from olmo.data.iterable_dataset import IterableDataset
-from olmo.exceptions import OLMoCliError
 from olmo.torch_util import seed_all
 from olmo.util import clean_opt, prepare_cli_environment
 
 log = logging.getLogger("run_dataloader")
 
 
-def main(cfg: TrainConfig) -> None:
-    # Set seed.
+def main(cfg: TrainConfig, output_dir: Path) -> None:
+    # Set seed
     seed_all(cfg.seed)
 
     # Set some additional settings
     if cfg.device_train_batch_size is None:
-        log.warning(
-            "device_train_batch_size is not set, so we're assuming we're running on 8 GPUs. "
-            "Set that value on the command line if this is not true."
-        )
-        cfg.device_train_batch_size = cfg.global_train_batch_size // 8
+        cfg.device_train_batch_size = cfg.global_train_batch_size
+    cfg.device_train_grad_accum = cfg.device_train_batch_size // cfg.device_train_microbatch_size
+    cfg.data.num_workers = 4
+    cfg.data.pin_memory = False
+    cfg.data.prefetch_factor = 4
 
     # Construct data loader.
     collator = DataCollator(pad_direction=cfg.data.pad_direction, pad_token_id=cfg.model.pad_token_id)
@@ -52,28 +54,39 @@ def main(cfg: TrainConfig) -> None:
         timeout=cfg.data.timeout,
     )
 
-    # Warm up the data loader
-    train_loader_iter = iter(train_loader)
-    next(train_loader_iter)
-
-    # Benchmark the dataloader
-    start_time = time.time()
-    last_log_time = start_time
-    batches_loaded = 0
-    for _ in train_loader_iter:
-        batches_loaded += 1
-        now = time.time()
-        if now - last_log_time > 1:
-            log.info(
-                "Read %d batches in %.2f seconds, %.2f batches per second",
-                batches_loaded,
-                now - start_time,
-                batches_loaded / (now - start_time),
-            )
-            last_log_time = now
+    batches_per_file = 1000
+    batches_read = 0
+    name_to_batches: Dict[str, torch.Tensor] = {}
+
+    for batch_number, batch in enumerate(tqdm(train_loader)):
+        for name, source_t in batch.items():
+            try:
+                target_t = name_to_batches[name]
+            except KeyError:
+                target_t = torch.zeros((batches_per_file,) + source_t.shape, dtype=source_t.dtype)
+                name_to_batches[name] = target_t
+            target_t[batches_read] = source_t
+        batches_read += 1
+
+        if batches_read >= batches_per_file:
+            file_start = batch_number - batches_per_file
+            file_end = batch_number
+            filename = output_dir / f"{file_start}-{file_end}.safetensors"
+            truncated_tensors = {n: t[:batches_read] for n, t in name_to_batches.items()}
+            safetensors.torch.save_file(truncated_tensors, filename)
+            batches_read = 0
 
 
 if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="replay the dataloader and write batches out to files")
+    parser.add_argument("-o", type=str, required=True, help="output directory")
+    parser.add_argument("config_file", type=str, help="config file")
+    args, other_args = parser.parse_known_args()
+    output_dir = Path(args.o)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
     try:
         mp.set_start_method("spawn", force=True)
     except RuntimeError as e:
@@ -85,13 +98,14 @@ def main(cfg: TrainConfig) -> None:
 
     log.info(f"multiprocessing start method set to '{mp.get_start_method()}'")
 
-    try:
-        yaml_path, args_list = sys.argv[1], sys.argv[2:]
-    except IndexError:
-        raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]")
-
-    args_list = [clean_opt(s) for s in args_list]
+    args_list = [clean_opt(s) for s in other_args]
     args_list.insert(0, "save_folder=runs/")
 
-    cfg = TrainConfig.load(yaml_path, args_list)
-    main(cfg)
+    cfg = TrainConfig.load(args.config_file, args_list)
+
+    # If you have the data downloaded locally, uncomment this and fix the path for a massive speedup.
+    # cfg.data.paths = [
+    #    p.replace("s3://", "/mnt/tank/") for p in cfg.data.paths
+    # ]
+
+    main(cfg, output_dir)

From 44f3c2d29a8a3e21fbc9c4e69db8bdf730262f60 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 10:28:55 -0700
Subject: [PATCH 35/49] Off by one

---
 scripts/run_dataloader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index a5b12608d..2ddf7b3e5 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -69,8 +69,8 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
         batches_read += 1
 
         if batches_read >= batches_per_file:
-            file_start = batch_number - batches_per_file
-            file_end = batch_number
+            file_start = batch_number - batches_per_file + 1
+            file_end = batch_number + 1
             filename = output_dir / f"{file_start}-{file_end}.safetensors"
             truncated_tensors = {n: t[:batches_read] for n, t in name_to_batches.items()}
             safetensors.torch.save_file(truncated_tensors, filename)

From 613715c8a6bd8e2afd375b5070fddf33f9fcbd4b Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 11:02:45 -0700
Subject: [PATCH 36/49] Save in torch format

---
 scripts/run_dataloader.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index 2ddf7b3e5..c13617f78 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -3,7 +3,6 @@
 from typing import Dict
 from tqdm import tqdm
 
-import safetensors.torch
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -71,9 +70,9 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
         if batches_read >= batches_per_file:
             file_start = batch_number - batches_per_file + 1
             file_end = batch_number + 1
-            filename = output_dir / f"{file_start}-{file_end}.safetensors"
-            truncated_tensors = {n: t[:batches_read] for n, t in name_to_batches.items()}
-            safetensors.torch.save_file(truncated_tensors, filename)
+            for name, t in name_to_batches.items():
+                filename = output_dir / f"{name}-{file_start}-{file_end}.pt"
+                torch.save(t[:batches_read], filename)
             batches_read = 0
 
 

From 1c223965b9975680b24423f8b12248256b80ffbf Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 11:16:28 -0700
Subject: [PATCH 37/49] Revert "Save in torch format"

This reverts commit 613715c8a6bd8e2afd375b5070fddf33f9fcbd4b.
---
 scripts/run_dataloader.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index c13617f78..2ddf7b3e5 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -3,6 +3,7 @@
 from typing import Dict
 from tqdm import tqdm
 
+import safetensors.torch
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -70,9 +71,9 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
         if batches_read >= batches_per_file:
             file_start = batch_number - batches_per_file + 1
             file_end = batch_number + 1
-            for name, t in name_to_batches.items():
-                filename = output_dir / f"{name}-{file_start}-{file_end}.pt"
-                torch.save(t[:batches_read], filename)
+            filename = output_dir / f"{file_start}-{file_end}.safetensors"
+            truncated_tensors = {n: t[:batches_read] for n, t in name_to_batches.items()}
+            safetensors.torch.save_file(truncated_tensors, filename)
             batches_read = 0
 
 

From 055eff77cf95aec3dba237c2a4a7d2bb1690f950 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 11:35:51 -0700
Subject: [PATCH 38/49] Reapply "Save in torch format"

This reverts commit 1c223965b9975680b24423f8b12248256b80ffbf.
---
 scripts/run_dataloader.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index 2ddf7b3e5..c13617f78 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -3,7 +3,6 @@
 from typing import Dict
 from tqdm import tqdm
 
-import safetensors.torch
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -71,9 +70,9 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
         if batches_read >= batches_per_file:
             file_start = batch_number - batches_per_file + 1
             file_end = batch_number + 1
-            filename = output_dir / f"{file_start}-{file_end}.safetensors"
-            truncated_tensors = {n: t[:batches_read] for n, t in name_to_batches.items()}
-            safetensors.torch.save_file(truncated_tensors, filename)
+            for name, t in name_to_batches.items():
+                filename = output_dir / f"{name}-{file_start}-{file_end}.pt"
+                torch.save(t[:batches_read], filename)
             batches_read = 0
 
 

From 446e90be5577a1f9175e225cd056b027e948f312 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 11:51:25 -0700
Subject: [PATCH 39/49] Torch doesn't have an uint16 type.

---
 scripts/run_dataloader.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index c13617f78..a5882c994 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -1,6 +1,8 @@
 import logging
 from pathlib import Path
 from typing import Dict
+
+import numpy as np
 from tqdm import tqdm
 
 import torch
@@ -55,14 +57,18 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
 
     batches_per_file = 1000
     batches_read = 0
-    name_to_batches: Dict[str, torch.Tensor] = {}
+    name_to_batches: Dict[str, np.ndarray] = {}
 
     for batch_number, batch in enumerate(tqdm(train_loader)):
         for name, source_t in batch.items():
+            source_t = source_t.numpy()
+            if name == "input_ids":
+                assert source_t.max() < 2**16  # uint16 holds 0..65535; 65536 would wrap to 0 in astype
+                source_t = source_t.astype(np.uint16)
             try:
                 target_t = name_to_batches[name]
             except KeyError:
-                target_t = torch.zeros((batches_per_file,) + source_t.shape, dtype=source_t.dtype)
+                target_t = np.zeros((batches_per_file,) + source_t.shape, dtype=source_t.dtype)
                 name_to_batches[name] = target_t
             target_t[batches_read] = source_t
         batches_read += 1
@@ -71,8 +77,8 @@ def main(cfg: TrainConfig, output_dir: Path) -> None:
             file_start = batch_number - batches_per_file + 1
             file_end = batch_number + 1
             for name, t in name_to_batches.items():
-                filename = output_dir / f"{name}-{file_start}-{file_end}.pt"
-                torch.save(t[:batches_read], filename)
+                filename = output_dir / f"{name}-{file_start}-{file_end}.npy"
+                np.save(filename, t[:batches_read])
             batches_read = 0
 
 

From 99c71f7a7d27a4409a3e5953278ad5be4cdc8a40 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 12:44:22 -0700
Subject: [PATCH 40/49] Silence warning

---
 scripts/run_dataloader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index a5882c994..a694fffeb 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -5,7 +5,6 @@
 import numpy as np
 from tqdm import tqdm
 
-import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader

From bed4324920163e6703ba458d312465240e09216d Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 12:44:44 -0700
Subject: [PATCH 41/49] Silence another warning

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c1cf71bf4..5d201590a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -109,7 +109,7 @@ extend_skip = ["pretrain_data", "tokenizer"]
 
 [tool.ruff]
 line-length = 115
-ignore = ["F403", "F405", "E501"]
+lint.ignore = ["F403", "F405", "E501"]
 exclude = [
     ".bzr",
     ".direnv",

From 4beb98027c76fb64c291c0be928d24fb3694a554 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 12:55:43 -0700
Subject: [PATCH 42/49] isort

---
 scripts/run_dataloader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/run_dataloader.py b/scripts/run_dataloader.py
index a694fffeb..8fcca7ff5 100644
--- a/scripts/run_dataloader.py
+++ b/scripts/run_dataloader.py
@@ -3,11 +3,10 @@
 from typing import Dict
 
 import numpy as np
-from tqdm import tqdm
-
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 
 from olmo.config import TrainConfig
 from olmo.data import build_memmap_dataset

From 60d90f4a072743a6eb66633ce9b14e6b32029439 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 14:52:21 -0700
Subject: [PATCH 43/49] Run preemptible

---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index e3948411e..554342299 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,7 @@ gantry-test :
 	gantry run \
 		--workspace "$(BEAKER_WORKSPACE)" \
 		--priority "normal" \
+		--preemptible \
 		--beaker-image "$(GANTRY_IMAGE)" \
 		--gpus 1 \
 		--description "Test run" \
@@ -90,6 +91,7 @@ gantry-run-ib :
 	gantry run \
 		--workspace "$(BEAKER_WORKSPACE)" \
 		--priority "normal" \
+		--preemptible \
 		--beaker-image "$(GANTRY_IMAGE)" \
 		--gpus 8 \
 		--description "LLM Beaker IB Cluster Run" \

From d5912f9f9489349224e370398c0f391fea9e1ec8 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 14:52:28 -0700
Subject: [PATCH 44/49] More clusters

---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 554342299..a3d6f6691 100644
--- a/Makefile
+++ b/Makefile
@@ -80,6 +80,8 @@ gantry-test :
 		--cluster ai2/s2-cirrascale \
 		--cluster ai2/general-cirrascale \
 		--cluster ai2/general-cirrascale-a100-80g-ib \
+		--cluster ai2/jupiter-cirrascale \
+		--cluster ai2/pluto-cirrascale \
 		--allow-dirty \
 		--venv base \
 		--timeout -1 \
@@ -96,6 +98,8 @@ gantry-run-ib :
 		--gpus 8 \
 		--description "LLM Beaker IB Cluster Run" \
 		--cluster ai2/general-cirrascale-a100-80g-ib \
+		--cluster ai2/jupiter-cirrascale \
+		--cluster ai2/pluto-cirrascale \
 		--nfs \
 		--env WORLD_SIZE=32 \
 		--env GPUS=8 \

From 16fad9dc98d837a6ce7a7aa2605ca19350510fa7 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 15:00:18 -0700
Subject: [PATCH 45/49] preemptible

---
 .github/workflows/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 222a1fcc2..7fd0d7b91 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -142,6 +142,7 @@ jobs:
                   beaker: ${{ env.BEAKER_IMAGE }}
                 context:
                   priority: normal
+                  preemptible: true
                 resources:
                   gpuCount: 1
                 constraints:

From 9383d2906f99048a67e253c91ef77431f803eaba Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 29 May 2024 17:26:22 -0700
Subject: [PATCH 46/49] Remove usages of Auto* methods in hf_olmo tests

---
 tests/hf_olmo/hf_pipeline_test.py            |  7 +++---
 tests/hf_olmo/modeling_olmo_test.py          | 25 ++++++--------------
 tests/hf_olmo/tokenization_olmo_fast_test.py | 15 ++++--------
 3 files changed, 14 insertions(+), 33 deletions(-)

diff --git a/tests/hf_olmo/hf_pipeline_test.py b/tests/hf_olmo/hf_pipeline_test.py
index ba366e5f2..9a759ef3c 100644
--- a/tests/hf_olmo/hf_pipeline_test.py
+++ b/tests/hf_olmo/hf_pipeline_test.py
@@ -1,11 +1,10 @@
 def test_pipeline(model_path: str):
     from transformers import TextGenerationPipeline
-    from transformers.models.auto import AutoModelForCausalLM, AutoTokenizer
 
-    from hf_olmo.modeling_olmo import OLMoConfig, OLMoForCausalLM  # noqa: F401
+    from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast
 
-    model = AutoModelForCausalLM.from_pretrained(model_path)
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = OLMoForCausalLM.from_pretrained(model_path)
+    tokenizer = OLMoTokenizerFast.from_pretrained(model_path)
     pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
     output = pipeline("question: who wrote romeo and juliet? answer: ", max_new_tokens=30)
     assert "generated_text" in output[0]
diff --git a/tests/hf_olmo/modeling_olmo_test.py b/tests/hf_olmo/modeling_olmo_test.py
index e4bb02f54..2c06316db 100644
--- a/tests/hf_olmo/modeling_olmo_test.py
+++ b/tests/hf_olmo/modeling_olmo_test.py
@@ -3,18 +3,15 @@
 import pytest
 import torch
 
+from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast
 from olmo.model import OLMo
 
 
 def test_olmo_model(model_path: str):
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-
-    from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast  # noqa: F401
-
     model = OLMo.from_checkpoint(model_path)
-    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
+    hf_model = OLMoForCausalLM.from_pretrained(model_path)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer = OLMoTokenizerFast.from_pretrained(model_path)
     input = tokenizer.encode("My name is OLMo!")
     input_tensor = torch.tensor(input).unsqueeze(0)
 
@@ -25,21 +22,17 @@ def test_olmo_model(model_path: str):
 
 
 def test_save_pretrained(model_path: str):
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-
-    from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast  # noqa: F401
-
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer = OLMoTokenizerFast.from_pretrained(model_path)
     input = tokenizer.encode("My name is OLMo!")
     input_tensor = torch.tensor(input).unsqueeze(0)
 
-    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
+    hf_model = OLMoForCausalLM.from_pretrained(model_path)
     hf_output = hf_model(input_tensor)
 
     with tempfile.TemporaryDirectory() as tmp_dir:
         hf_model.save_pretrained(tmp_dir)
 
-        saved_hf_model = AutoModelForCausalLM.from_pretrained(tmp_dir)
+        saved_hf_model = OLMoForCausalLM.from_pretrained(tmp_dir)
         saved_hf_output = saved_hf_model(input_tensor)
 
         torch.testing.assert_allclose(saved_hf_output.logits, hf_output.logits)
@@ -48,9 +41,5 @@ def test_save_pretrained(model_path: str):
 @pytest.mark.gpu
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA devices")
 def test_auto_device_map_load(model_path: str):
-    from transformers import AutoModelForCausalLM
-
-    from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast  # noqa: F401
-
-    hf_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
+    hf_model = OLMoForCausalLM.from_pretrained(model_path, device_map="auto")
     assert hf_model.device.type == "cuda"
diff --git a/tests/hf_olmo/tokenization_olmo_fast_test.py b/tests/hf_olmo/tokenization_olmo_fast_test.py
index 10bb4f7dd..dac38deb3 100644
--- a/tests/hf_olmo/tokenization_olmo_fast_test.py
+++ b/tests/hf_olmo/tokenization_olmo_fast_test.py
@@ -1,15 +1,12 @@
 import tempfile
 
+from hf_olmo import OLMoTokenizerFast
 from olmo.tokenizer import Tokenizer
 
 
 def test_olmo_tokenizer(model_path: str):
-    from transformers import AutoTokenizer
-
-    from hf_olmo import OLMoTokenizerFast  # noqa: F401
-
     tok = Tokenizer.from_checkpoint(model_path)
-    hf_tok = AutoTokenizer.from_pretrained(model_path)
+    hf_tok = OLMoTokenizerFast.from_pretrained(model_path)
 
     input_str = "Hello, this is a test!"
 
@@ -26,11 +23,7 @@ def test_olmo_tokenizer(model_path: str):
 
 
 def test_save_pretrained(model_path: str):
-    from transformers import AutoTokenizer
-
-    from hf_olmo import OLMoTokenizerFast  # noqa: F401
-
-    hf_tok = AutoTokenizer.from_pretrained(model_path)
+    hf_tok = OLMoTokenizerFast.from_pretrained(model_path)
 
     input_str = "Hello, this is a test!"
 
@@ -40,7 +33,7 @@ def test_save_pretrained(model_path: str):
     with tempfile.TemporaryDirectory() as tmp_dir:
         hf_tok.save_pretrained(tmp_dir)
 
-        saved_hf_tok = AutoTokenizer.from_pretrained(tmp_dir)
+        saved_hf_tok = OLMoTokenizerFast.from_pretrained(tmp_dir)
         saved_hf_tokenized = saved_hf_tok.encode(input_str)
 
         assert hf_tokenized == saved_hf_tokenized

From 86f7f979213edc37079d8774adc9a09a9e599822 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 29 May 2024 17:31:05 -0700
Subject: [PATCH 47/49] Make test_config_save save to temp file

---
 tests/hf_olmo/configuration_olmo_test.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/hf_olmo/configuration_olmo_test.py b/tests/hf_olmo/configuration_olmo_test.py
index 10de155d0..8791040ea 100644
--- a/tests/hf_olmo/configuration_olmo_test.py
+++ b/tests/hf_olmo/configuration_olmo_test.py
@@ -1,16 +1,18 @@
 from olmo.config import ModelConfig
+import tempfile
 
 
 def test_config_save(model_path: str):
     from hf_olmo.configuration_olmo import OLMoConfig
 
-    config = ModelConfig(alibi=True)  # default is False
-    hf_config = OLMoConfig(**config.asdict())
+    with tempfile.TemporaryDirectory() as temp_dir:
+        config = ModelConfig(alibi=True)  # default is False
+        hf_config = OLMoConfig(**config.asdict())
 
-    hf_config.save_pretrained(model_path)
-    loaded_hf_config = OLMoConfig.from_pretrained(model_path)
+        hf_config.save_pretrained(temp_dir)
+        loaded_hf_config = OLMoConfig.from_pretrained(temp_dir)
 
-    assert hf_config.to_dict() == loaded_hf_config.to_dict()
+        assert hf_config.to_dict() == loaded_hf_config.to_dict()
 
-    for key, val in config.asdict().items():
-        assert getattr(loaded_hf_config, key) == val
+        for key, val in config.asdict().items():
+            assert getattr(loaded_hf_config, key) == val

From 5b988580cd7897d7eeb774d4111965e261e4a925 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 29 May 2024 17:31:20 -0700
Subject: [PATCH 48/49] Run ruff

---
 tests/hf_olmo/configuration_olmo_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/hf_olmo/configuration_olmo_test.py b/tests/hf_olmo/configuration_olmo_test.py
index 8791040ea..38aa8f94f 100644
--- a/tests/hf_olmo/configuration_olmo_test.py
+++ b/tests/hf_olmo/configuration_olmo_test.py
@@ -1,6 +1,7 @@
-from olmo.config import ModelConfig
 import tempfile
 
+from olmo.config import ModelConfig
+
 
 def test_config_save(model_path: str):
     from hf_olmo.configuration_olmo import OLMoConfig

From f7f0f8893b3ece1314434fd52eeb5496312c25aa Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 29 May 2024 23:37:57 -0700
Subject: [PATCH 49/49] Fall back to weight_tying=True when the config omits it

---
 scripts/convert_olmo_to_hf_new.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py
index 2fbbb2773..6780915e3 100644
--- a/scripts/convert_olmo_to_hf_new.py
+++ b/scripts/convert_olmo_to_hf_new.py
@@ -162,7 +162,7 @@ def write_model(model_path, input_base_path, tokenizer_path=None, safe_serializa
         pad_token_id=olmo_config["pad_token_id"],
         bos_token_id=None,
         eos_token_id=olmo_config["eos_token_id"],
-        tie_word_embeddings=olmo_config["weight_tying"],
+        tie_word_embeddings=olmo_config.get("weight_tying", True),
         rope_theta=base,
         clip_qkv=olmo_config.get("clip_qkv"),
     )