Here are the steps I did:
- Started an interactive session
interact --mem=10g
- Load Python/3.10.4-GCCcore-11.3.0 with
ml Python/3.10.4-GCCcore-11.3.0
- Create a virtual python environment with
python -m venv ~/env/genslm
- Activate this env
. ~/env/genslm/bin/activate
- Install GenSLM in this env with
pip install git+https://github.com/ramanathanlab/genslm
- Then, I modified the example Python script for our dataset. This is the file I used:
# Compute GenSLM embeddings for every sequence in a FASTA file and print the
# shape of the resulting array.
import torch
import numpy as np
from torch.utils.data import DataLoader
from genslm import GenSLM, SequenceDataset
from Bio import SeqIO

# Initialize GenSLM model with a valid model_id that matches your model's architecture
# NOTE(review): torch.load() needs the checkpoint *file*; this path ends in a
# directory separator, so the .pt file name appears to be missing -- confirm.
model_path = '/scratch/ss11645/GenSLM/MLProject/models/'
# NOTE(review): the upstream README also passes model_cache_dir=... to GenSLM;
# confirm whether it is required for this installation.
model = GenSLM('genslm_25M_patric')  # This sets up the architecture
custom_model_state = torch.load(model_path, map_location=torch.device('cpu'))

# Actually apply the loaded weights (previously they were loaded and discarded).
# If the .pt file contains the model state under a specific key, adjust the key
# accordingly; 'state_dict' is assumed here when present -- TODO confirm.
state_dict = custom_model_state
if isinstance(custom_model_state, dict) and 'state_dict' in custom_model_state:
    state_dict = custom_model_state['state_dict']
model.load_state_dict(state_dict)

model.eval()  # Prepare the model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
# The batches are moved to `device` below, so the model must live there too.
model.to(device)

# Load and prepare your data
fasta_file = '/scratch/ss11645/GenSLM/MLProject/Seperated_files/h3n2.64000.fasta'
sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]
if not sequences:
    # Fail early with a clear message instead of a cryptic np.concatenate error.
    raise ValueError(f"No sequences found in {fasta_file}")

# Prepare dataset and dataloader
dataset = SequenceDataset(sequences, model.seq_length, model.tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Compute embeddings
embeddings = []
with torch.no_grad():
    for batch in dataloader:
        outputs = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            output_hidden_states=True,
        )
        # Use the hidden states of the last layer.
        emb = outputs.hidden_states[-1].detach().cpu().numpy()
        # NOTE(review): the upstream GenSLM example averages over axis=1
        # (sequence length) to get one vector per sequence; axis=2 averages
        # over the hidden dimension instead -- confirm this is intended.
        emb = np.mean(emb, axis=2)
        # Collect the per-batch result; without this the list stays empty.
        embeddings.append(emb)

# Concatenate all embeddings
embeddings = np.concatenate(embeddings, axis=0)

# Output the shape of the embeddings array
print(embeddings.shape)
- I submitted the job on the Sapelo2 cluster using the following Slurm batch script:
#!/bin/bash
# Slurm batch script: runs the embedding script inside the genslm virtualenv.
# sbatch requires the interpreter line above to be the first line of the file.
#SBATCH --job-name=embeddings
#SBATCH --partition=bahl_p
#SBATCH --nodes=1
#SBATCH --ntasks=1
# Match the 10g used for the interactive session; 2gb was inconsistent with it.
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=4
#SBATCH --time=500:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
# NOTE(review): no --mail-user is set, so confirm where these mails are sent.
#SBATCH --mail-type=END,FAIL #Mail events (NONE, BEGIN, END, FAIL, ALL)

# Load the same Python module and virtual environment used during installation.
ml Python/3.10.4-GCCcore-11.3.0
. ~/env/genslm/bin/activate

# NOTE(review): this path ends at a directory; the script file name appears to
# be missing -- confirm it points at the .py file to run.
python /scratch/ss11645/GenSLM/