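"""Training pipeline for the LLaDA-style DiffusionLLM model.

Parses command-line arguments (with Jupyter-friendly defaults), prepares the
dataset and tokenizer, creates or loads a DiffusionLLM, trains it via
``trainer``, and optionally pushes the result to the Hugging Face Hub.
"""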
import torch
import os
import random
import argparse
import numpy as np
from IPython import get_ipython
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from MODEL.model import DiffusionLLM, DiffusionConfig
from trainer import trainer
from model_save import load_model
from datasetANDtokenizer import prepare_dataset
from register_model import registerANDpush
parser = argparse.ArgumentParser(description="Train LLaDA model")
parser.add_argument("--dataset", type=str, default="wikitext/wikitext-103-v1", help="Dataset name")
parser.add_argument("--tokenizer", type=str, default="gpt2", help="Tokenizer name")
parser.add_argument("--model_size", type=str, default="small", help="Model size (small, medium, large)")
parser.add_argument("--batch_size", type=int, default=8, help="Batch size")
parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate")
parser.add_argument("--max_length", type=int, default=256, help="Maximum sequence length")
parser.add_argument("--save_dir", type=str, default="./saved_models", help="Directory to save models")
parser.add_argument("--seed", type=int, default=42, help="Random seed")
parser.add_argument("--num_timesteps", type=int, default=100, help="Number of diffusion timesteps")
parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of inference steps")
parser.add_argument("--load_path", type=str, default=None, help="Path to load model from")
parser.add_argument("--cache_dir", type=str, default=None, help="Cache directory")
parser.add_argument("--num_proc", type=int, default=4, help="Number of processes for tokenization")
parser.add_argument("--push_to_hub", type=bool, default=False, help="Push model to Hugging Face Hub")
if get_ipython() is not None:
    # If in Jupyter, use default arguments
    args = parser.parse_args([])  # Pass empty list to parse_args
else:
    # If in command-line, parse arguments normally
    args = parser.parse_args()
# Set random seed
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
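# Optionally, cuDNN determinism can be enabled for fully reproducible GPU runs:
# torch.backends.cudnn.deterministic = True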
# Create save directory if it doesn't exist
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir)
# Prepare dataset
print(f"Loading dataset {args.dataset}...")
train_dataset, val_dataset, tokenizer = prepare_dataset(
    dataset_name=args.dataset,
    tokenizer_name=args.tokenizer,
    max_length=args.max_length,
    cache_dir=args.cache_dir,
    num_proc=args.num_proc,
)
print(f"Dataset loaded. Train size: {len(train_dataset)}")
# Use only the first 10% of the training data
train_dataset = torch.utils.data.Subset(train_dataset, range(int(len(train_dataset) * 0.1)))
print(f"Train subset size: {len(train_dataset)}")
if val_dataset:
    print(f"Validation size: {len(val_dataset)}")
    val_dataset = torch.utils.data.Subset(val_dataset, range(int(len(val_dataset) * 0.1)))
    print(f"Validation subset size: {len(val_dataset)}")
# Determine device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Configure model size
model_configs = {
    "small": {
        "hidden_size": 256,
        "num_hidden_layers": 4,
        "num_attention_heads": 4,
    },
    "medium": {
        "hidden_size": 512,
        "num_hidden_layers": 8,
        "num_attention_heads": 8,
    },
    "large": {
        "hidden_size": 1024,
        "num_hidden_layers": 16,
        "num_attention_heads": 16,
    },
}
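# All three presets keep a per-head dimension of 64
# (hidden_size / num_attention_heads: 256/4 = 512/8 = 1024/16 = 64).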
config_kwargs = model_configs.get(args.model_size, model_configs["small"])
# Create or load model
if args.load_path is not None:
    print(f"Loading model from {args.load_path}...")
    model, optimizer = load_model(args.load_path, device)
else:
    print("Creating new model...")
    # Create config with the tokenizer's vocabulary size
    config = DiffusionConfig(
        vocab_size=len(tokenizer),
        max_position_embeddings=args.max_length,
        num_timesteps=args.num_timesteps,
        pad_token_id=tokenizer.pad_token_id,
        mask_token_id=tokenizer.mask_token_id,
        **config_kwargs,
    )
    model = DiffusionLLM(config)
# Log number of parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Model has {total_params:,} parameters")
# Train model
print("Starting training...")
train_model = trainer(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=args.batch_size,
    num_epochs=args.num_epochs,
    learning_rate=args.learning_rate,
    num_timesteps=args.num_timesteps,
    save_path=args.save_dir,
    device=device,
)
print("Training completed!")
# push model to hub
if args.push_to_hub:
registerANDpush(model,
tokenizer,
"diffusionLM",
DiffusionLLM,
DiffusionConfig,
repo_id="codewithdark/DiffusionLM")
# Example usage:
# python utils/pipeline.py \
#     --dataset wikitext/wikitext-103-v1 \
#     --tokenizer gpt2 \
#     --model_size small \
#     --batch_size 8 \
#     --num_epochs 1 \
#     --learning_rate 5e-5 \
#     --max_length 256 \
#     --save_dir ./saved_models \
#     --seed 42 \
#     --num_timesteps 100 \
#     --num_inference_steps 50 \
#     --num_proc 4 \
#     --push_to_hub
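#
# To resume from a previously saved checkpoint, pass --load_path (the
# checkpoint file name below is illustrative; use whatever was written
# to --save_dir):
# python utils/pipeline.py --load_path ./saved_models/checkpoint.pt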