Commit f798a19

wip(generate + eda): working generation + add initial eda
1 parent 4c2fce6 commit f798a19

8 files changed: 1,548 additions and 18 deletions

agent-traces/README.md

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+export UV_LINK_MODE=copy
+uv venv /fsx/baptiste_colle/open-r1-venv --python 3.10
+uv pip install -r baptiste/requirements.txt
+
+uv venv /fsx/baptiste_colle/open-r1-test-venv --python 3.10
+
+sbatch slurm/agentic_generation.slurm
+
+sbatch slurm/test_generate.slurm
+
+squeue -u $USER
+
+scontrol show job 15678390
+
+sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124"
+
+curl http://10.53.86.164:39876/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "default", "messages": [{"role": "user", "content": "What is the capital of France?"}], "max_tokens": 32}'
+
+sbatch slurm/serve_router.slurm
+
+http://10.53.95.152:39876/v1/chat/completions
+
+ROUTER_ADDRESS="10.53.86.164:39876"
+FIRST_NODE_IP="26.0.174.186"
+SERVER_PORT="39877"
+
+curl -X POST "http://${ROUTER_ADDRESS}/add_worker?url=http://${FIRST_NODE_IP}:${SERVER_PORT}"
+
+sbatch slurm/serve_router.slurm
+sbatch slurm/serve_r1.slurm # do not forget to add the router address
+
+sbatch slurm/agentic_generation.slurm
+
+cp codeforces_agentic_generations.jsonl codeforces_agentic_generations_backup_$(date +%Y%m%d_%H%M%S).jsonl
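Note: the curl smoke test above can also be run from agent-traces/eda.ipynb with requests (already in agent-traces/requirements.txt). A minimal sketch; the router address is the hard-coded one from this README and will change whenever the Slurm job is rescheduled:

# Same smoke test as the curl above, via the router's OpenAI-compatible endpoint.
# ROUTER_URL is the address hard-coded in this README; adjust per allocation.
import requests

ROUTER_URL = "http://10.53.86.164:39876/v1/chat/completions"

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "max_tokens": 32,
}

resp = requests.post(ROUTER_URL, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])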

agent-traces/eda.ipynb

Lines changed: 1463 additions & 0 deletions
Large diffs are not rendered by default.

agent-traces/requirements.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+argparse
+datasets
+tqdm
+requests
+python-dotenv
+transformers==4.49.0
+smolagents
+hf_transfer
+ipykernel
+ipywidgets
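Note: hf_transfer is only used by the Hugging Face libraries when HF_HUB_ENABLE_HF_TRANSFER=1 is set before they are imported. A small sketch (setting it in the shell or the Slurm script works equally well):

# hf_transfer only kicks in when this variable is set before datasets /
# huggingface_hub are imported; otherwise downloads fall back to the default path.
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from datasets import load_dataset  # large file downloads now go through hf_transfer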

scripts/generate_agent_traces.py

Lines changed: 12 additions & 8 deletions
@@ -21,9 +21,6 @@
 from smolagents import CodeAgent, Tool
 from smolagents.models import get_clean_message_list
 
-from dotenv import load_dotenv
-
-load_dotenv()
 file_lock = Lock()
 
 tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
@@ -225,18 +222,21 @@ def main():
     parser.add_argument("--prompt-column", type=str, required=True)
     parser.add_argument("--uuid-column", type=str, required=True)
     parser.add_argument("--api-addr", type=str, default="localhost:39876")
-    parser.add_argument("--num-generations", type=int, default=4)
+    parser.add_argument("--num-generations", type=int, default=5)
     parser.add_argument("--temperature", type=float, default=0.6)
     parser.add_argument("--top-p", type=float, default=0.95)
     parser.add_argument("--max-tokens", type=int, default=8096)
     parser.add_argument("--max-concurrent", type=int, default=1000)
     args = parser.parse_args()
-
+
+    subset = ""
+    # subset = "[:10]"
+    seed = 42
+
     dataset = load_dataset(
         "open-r1/codeforces-test-cases",
-        split="train",
-        token=os.getenv("HF_TOKEN")
-    ).shuffle()
+        split=f"train{subset}",
+    ).shuffle(seed=seed)
     dataset = dataset.filter(lambda x: x["full_test_set"])
 
     processed_uuids = load_processed_uuids(args.output_file, args.uuid_column)
@@ -251,6 +251,10 @@ def main():
     if not output_path.exists():
         with open(args.output_file, mode="w") as f:
             f.write("")
+
+    # print(f"Processing using {args.max_concurrent} workers")
+    # print(f"Using ip {args.api_addr}")
+
 
     # Create a session that will be shared among threads
     session = requests.Session()
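The new subset and seed variables lean on the datasets split-slicing syntax: an empty string keeps the full train split, while "[:10]" limits a run to a quick smoke test, and the fixed shuffle seed makes the dataset order reproducible across restarts (which is what lets the processed-UUID check above skip already-generated rows). A minimal sketch of the pattern in isolation, using the dataset name from the diff:

# Split slicing + seeded shuffle as introduced in the diff above.
# subset = "" processes the full split; "[:10]" is handy for debugging runs.
from datasets import load_dataset

subset = ""   # or "[:10]" for a small smoke test
seed = 42

dataset = load_dataset(
    "open-r1/codeforces-test-cases",
    split=f"train{subset}",
).shuffle(seed=seed)
dataset = dataset.filter(lambda x: x["full_test_set"])
print(len(dataset))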

slurm/README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 conda create -n sglang124 python=3.11
 conda activate sglang124
 
-pip install torch=2.5.1 --index-url https://download.pytorch.org/whl/cu124
+pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
 
 pip install sgl-kernel --force-reinstall --no-deps
 pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/

slurm/agentic_generation.slurm

Lines changed: 6 additions & 6 deletions
@@ -1,23 +1,23 @@
 #!/bin/bash
 #SBATCH --job-name=agentic-r1
-#SBATCH --gres=gpu:8
-#SBATCH --partition=hopper-prod
+#SBATCH --partition=hopper-cpu
 #SBATCH --qos=high
 #SBATCH --nodes=1
+#SBATCH --cpus-per-task=64
+#SBATCH --exclusive
 #SBATCH --output=./logs/%x_%j_%n.out
 #SBATCH --error=./logs/%x_%j_%n.err
 #SBATCH --time=7-00:00:00
 set -exuo pipefail
 
 source ~/.bashrc
-source $(conda info --base)/etc/profile.d/conda.sh
-conda activate /fsx/aymeric/venv
+source /fsx/baptiste_colle/open-r1-venv/bin/activate
 
 python scripts/generate_agent_traces.py \
-    --output-file "codeforces_agentic_generations.jsonl" \
+    --output-file "data/codeforces_agentic_generations.jsonl" \
     --prompt-column "prompt" \
     --uuid-column "contestId" \
-    --api-addr "10.53.83.199:39876" \
+    --api-addr "10.53.86.164:39876" \
     --num-generations 5 \
     --max-tokens 8096 \
     --max-concurrent 64
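The output file now lives under data/, and the hunks of generate_agent_traces.py shown above only check output_path.exists() before writing, so this assumes the data/ directory already exists when the job starts. A hedged sketch of a guard that could sit near that check (an assumption, not part of the commit):

# Hypothetical guard: create the parent directory of --output-file if missing,
# then initialize an empty JSONL, mirroring the exists() check in the script.
from pathlib import Path

output_path = Path("data/codeforces_agentic_generations.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)
if not output_path.exists():
    output_path.write_text("")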

slurm/serve_r1.slurm

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 #SBATCH --qos=normal
 #SBATCH --nodes=2
 #SBATCH --gpus-per-node=8
+#SBATCH --mem-per-cpu=1875m
 #SBATCH --exclusive
 #SBATCH --output=./logs/%x_%j_%n.out
 #SBATCH --error=./logs/%x_%j_%n.err
@@ -14,7 +15,7 @@ set -exuo pipefail
 
 MODEL_PATH="deepseek-ai/DeepSeek-R1"
 CONDA_ENV="sglang124"
-ROUTER_ADDRESS=""
+ROUTER_ADDRESS="10.53.86.164:39876"
 SERVER_PORT=39877
 DIST_PORT=45000
 
@@ -36,7 +37,7 @@ done
 # TODO: Environment setup, adjust to your cluster configuration
 module load cuda/12.4
 source ~/.bashrc
-source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+source "$HOME/miniconda3/etc/profile.d/conda.sh"
 conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; }
 
 FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
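With ROUTER_ADDRESS pre-filled, the server can be announced to the router via the same /add_worker call that agent-traces/README.md issues with curl. A Python sketch of that registration; the addresses are the hard-coded values from this commit and will differ per allocation:

# Register a server node with the sglang router (mirrors the curl in the README).
# ROUTER_ADDRESS / FIRST_NODE_IP / SERVER_PORT are the commit's hard-coded values.
import requests

ROUTER_ADDRESS = "10.53.86.164:39876"
FIRST_NODE_IP = "26.0.174.186"
SERVER_PORT = 39877

resp = requests.post(
    f"http://{ROUTER_ADDRESS}/add_worker?url=http://{FIRST_NODE_IP}:{SERVER_PORT}"
)
print(resp.status_code, resp.text)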

slurm/serve_router.slurm

Lines changed: 6 additions & 1 deletion
@@ -27,9 +27,14 @@ done
 
 # TODO: Environment setup, adjust to your cluster configuration
 source ~/.bashrc
-source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+source "$HOME/miniconda3/etc/profile.d/conda.sh"
 conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; }
 
+# FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
+# FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address)
+
+# echo "Router IP: $FIRST_NODE_IP"
+
 python -m sglang_router.launch_router \
     --port "$ROUTER_PORT" \
     --host 0.0.0.0 \
