Commit f798a19

wip(generate + eda): working generation + add initial eda
1 parent 4c2fce6 commit f798a19

8 files changed: 1,548 additions and 18 deletions

agent-traces/README.md

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+export UV_LINK_MODE=copy
+uv venv /fsx/baptiste_colle/open-r1-venv --python 3.10
+uv pip install -r baptiste/requirements.txt
+
+uv venv /fsx/baptiste_colle/open-r1-test-venv --python 3.10
+
+sbatch slurm/agentic_generation.slurm
+
+sbatch slurm/test_generate.slurm
+
+squeue -u $USER
+
+scontrol show job 15678390
+
+sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124"
+
+curl http://10.53.86.164:39876/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "default", "messages": [{"role": "user", "content": "What is the capital of France?"}], "max_tokens": 32}'
+
+sbatch slurm/serve_router.slurm
+
+http://10.53.95.152:39876/v1/chat/completions
+
+ROUTER_ADDRESS="10.53.86.164:39876"
+FIRST_NODE_IP="26.0.174.186"
+SERVER_PORT="39877"
+
+curl -X POST "http://${ROUTER_ADDRESS}/add_worker?url=http://${FIRST_NODE_IP}:${SERVER_PORT}"
+
+sbatch slurm/serve_router.slurm
+sbatch slurm/serve_r1.slurm # do not forget to add the router address
+
+sbatch slurm/agentic_generation.slurm
+
+cp codeforces_agentic_generations.jsonl codeforces_agentic_generations_backup_$(date +%Y%m%d_%H%M%S).jsonl
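Note: the curl smoke test above can also be run from agent-traces/eda.ipynb with requests (already in agent-traces/requirements.txt). A minimal sketch; the router address is the hard-coded one from this README and will change whenever the Slurm job is rescheduled:

# Same smoke test as the curl above, via the router's OpenAI-compatible endpoint.
# ROUTER_URL is the address hard-coded in this README; adjust per allocation.
import requests

ROUTER_URL = "http://10.53.86.164:39876/v1/chat/completions"

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "max_tokens": 32,
}

resp = requests.post(ROUTER_URL, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])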

agent-traces/eda.ipynb

Lines changed: 1463 additions & 0 deletions
Large diffs are not rendered by default.

agent-traces/requirements.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+argparse
+datasets
+tqdm
+requests
+python-dotenv
+transformers==4.49.0
+smolagents
+hf_transfer
+ipykernel
+ipywidgets
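Note: hf_transfer is only used by the Hugging Face libraries when HF_HUB_ENABLE_HF_TRANSFER=1 is set before they are imported. A small sketch (setting it in the shell or the Slurm script works equally well):

# hf_transfer only kicks in when this variable is set before datasets /
# huggingface_hub are imported; otherwise downloads fall back to the default path.
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from datasets import load_dataset  # large file downloads now go through hf_transfer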

scripts/generate_agent_traces.py

Lines changed: 12 additions & 8 deletions
@@ -21,9 +21,6 @@
 from smolagents import CodeAgent, Tool
 from smolagents.models import get_clean_message_list
 
-from dotenv import load_dotenv
-
-load_dotenv()
 file_lock = Lock()
 
 tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
@@ -225,18 +222,21 @@ def main():
     parser.add_argument("--prompt-column", type=str, required=True)
     parser.add_argument("--uuid-column", type=str, required=True)
     parser.add_argument("--api-addr", type=str, default="localhost:39876")
-    parser.add_argument("--num-generations", type=int, default=4)
+    parser.add_argument("--num-generations", type=int, default=5)
     parser.add_argument("--temperature", type=float, default=0.6)
     parser.add_argument("--top-p", type=float, default=0.95)
     parser.add_argument("--max-tokens", type=int, default=8096)
     parser.add_argument("--max-concurrent", type=int, default=1000)
     args = parser.parse_args()
-
+
+    subset = ""
+    # subset = "[:10]"
+    seed = 42
+
     dataset = load_dataset(
         "open-r1/codeforces-test-cases",
-        split="train",
-        token=os.getenv("HF_TOKEN")
-    ).shuffle()
+        split=f"train{subset}",
+    ).shuffle(seed=seed)
     dataset = dataset.filter(lambda x: x["full_test_set"])
 
     processed_uuids = load_processed_uuids(args.output_file, args.uuid_column)
@@ -251,6 +251,10 @@ def main():
     if not output_path.exists():
         with open(args.output_file, mode="w") as f:
             f.write("")
+
+    # print(f"Processing using {args.max_concurrent} workers")
+    # print(f"Using ip {args.api_addr}")
+
 
     # Create a session that will be shared among threads
     session = requests.Session()
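The new subset and seed variables lean on the datasets split-slicing syntax: an empty string keeps the full train split, while "[:10]" limits a run to a quick smoke test, and the fixed shuffle seed makes the dataset order reproducible across restarts (which is what lets the processed-UUID check above skip already-generated rows). A minimal sketch of the pattern in isolation, using the dataset name from the diff:

# Split slicing + seeded shuffle as introduced in the diff above.
# subset = "" processes the full split; "[:10]" is handy for debugging runs.
from datasets import load_dataset

subset = ""   # or "[:10]" for a small smoke test
seed = 42

dataset = load_dataset(
    "open-r1/codeforces-test-cases",
    split=f"train{subset}",
).shuffle(seed=seed)
dataset = dataset.filter(lambda x: x["full_test_set"])
print(len(dataset))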

slurm/README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 conda create -n sglang124 python=3.11
 conda activate sglang124
 
-pip install torch=2.5.1 --index-url https://download.pytorch.org/whl/cu124
+pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
 
 pip install sgl-kernel --force-reinstall --no-deps
 pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/

slurm/agentic_generation.slurm

Lines changed: 6 additions & 6 deletions
@@ -1,23 +1,23 @@
 #!/bin/bash
 #SBATCH --job-name=agentic-r1
-#SBATCH --gres=gpu:8
-#SBATCH --partition=hopper-prod
+#SBATCH --partition=hopper-cpu
 #SBATCH --qos=high
 #SBATCH --nodes=1
+#SBATCH --cpus-per-task=64
+#SBATCH --exclusive
 #SBATCH --output=./logs/%x_%j_%n.out
 #SBATCH --error=./logs/%x_%j_%n.err
 #SBATCH --time=7-00:00:00
 set -exuo pipefail
 
 source ~/.bashrc
-source $(conda info --base)/etc/profile.d/conda.sh
-conda activate /fsx/aymeric/venv
+source /fsx/baptiste_colle/open-r1-venv/bin/activate
 
 python scripts/generate_agent_traces.py \
-    --output-file "codeforces_agentic_generations.jsonl" \
+    --output-file "data/codeforces_agentic_generations.jsonl" \
     --prompt-column "prompt" \
     --uuid-column "contestId" \
-    --api-addr "10.53.83.199:39876" \
+    --api-addr "10.53.86.164:39876" \
     --num-generations 5 \
     --max-tokens 8096 \
     --max-concurrent 64
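The output file now lives under data/, and the hunks of generate_agent_traces.py shown above only check output_path.exists() before writing, so this assumes the data/ directory already exists when the job starts. A hedged sketch of a guard that could sit near that check (an assumption, not part of the commit):

# Hypothetical guard: create the parent directory of --output-file if missing,
# then initialize an empty JSONL, mirroring the exists() check in the script.
from pathlib import Path

output_path = Path("data/codeforces_agentic_generations.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)
if not output_path.exists():
    output_path.write_text("")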

slurm/serve_r1.slurm

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 #SBATCH --qos=normal
 #SBATCH --nodes=2
 #SBATCH --gpus-per-node=8
+#SBATCH --mem-per-cpu=1875m
 #SBATCH --exclusive
 #SBATCH --output=./logs/%x_%j_%n.out
 #SBATCH --error=./logs/%x_%j_%n.err
@@ -14,7 +15,7 @@ set -exuo pipefail
 
 MODEL_PATH="deepseek-ai/DeepSeek-R1"
 CONDA_ENV="sglang124"
-ROUTER_ADDRESS=""
+ROUTER_ADDRESS="10.53.86.164:39876"
 SERVER_PORT=39877
 DIST_PORT=45000
 
@@ -36,7 +37,7 @@ done
 # TODO: Environment setup, adjust to your cluster configuration
 module load cuda/12.4
 source ~/.bashrc
-source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+source "$HOME/miniconda3/etc/profile.d/conda.sh"
 conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; }
 
 FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
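With ROUTER_ADDRESS pre-filled, the server can be announced to the router via the same /add_worker call that agent-traces/README.md issues with curl. A Python sketch of that registration; the addresses are the hard-coded values from this commit and will differ per allocation:

# Register a server node with the sglang router (mirrors the curl in the README).
# ROUTER_ADDRESS / FIRST_NODE_IP / SERVER_PORT are the commit's hard-coded values.
import requests

ROUTER_ADDRESS = "10.53.86.164:39876"
FIRST_NODE_IP = "26.0.174.186"
SERVER_PORT = 39877

resp = requests.post(
    f"http://{ROUTER_ADDRESS}/add_worker?url=http://{FIRST_NODE_IP}:{SERVER_PORT}"
)
print(resp.status_code, resp.text)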

slurm/serve_router.slurm

Lines changed: 6 additions & 1 deletion
@@ -27,9 +27,14 @@ done
 
 # TODO: Environment setup, adjust to your cluster configuration
 source ~/.bashrc
-source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+source "$HOME/miniconda3/etc/profile.d/conda.sh"
 conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; }
 
+# FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
+# FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address)
+
+# echo "Router IP: $FIRST_NODE_IP"
+
 python -m sglang_router.launch_router \
     --port "$ROUTER_PORT" \
     --host 0.0.0.0 \
