I am training an NLP Hugging Face model on Vertex AI with a custom image. The same code works on my local machine. Here is my code and the error.
import os

# set these before importing transformers: TRANSFORMERS_OFFLINE is read when the package is imported
os.environ['TRANSFORMERS_OFFLINE']='1'
os.environ['HF_MLFLOW_LOG_ARTIFACTS']='TRUE'

import torch
import pandas as pd
import transformers as tr
from transformers import DataCollatorForLanguageModeling

print("package import completed")
print("env setup completed")
print(tr.__version__)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using", device)
torch.backends.cudnn.deterministic = True
tr.trainer_utils.set_seed(0)
print("here")
tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large",local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True,local_files_only=True)
model.to(device)
print("Model loaded successfully")
df=pd.read_csv("gs://****bucket***/data.csv")
print("read csv")
train_df=df.text.tolist()
print(len(train_df))
train_df=list(set(train_df))
train_df = [x for x in train_df if str(x) != 'nan']
train_df=train_df[:50]
print("Length of training data is \n ",len(train_df))
print("DATA LOADED successfully")
train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
print("encoding done")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
print("data collector done")
class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["attention_mask"])
train_data = SEDataset(train_encodings)
print("train data created")
training_args = tr.TrainingArguments(
    output_dir='gs://****bucket***/results_mlm_exp1',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    prediction_loss_only=True,
    learning_rate=2e-5,
    # save_strategy="epoch",
    # run_name="MLM_Exp1",
    # logging_dir='gs://****bucket***/logs_mlm_exp1',  # directory for storing logs
    # logging_steps=32000,
)
trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
)
print("training to start")
trainer.train()
print("model training finished")
trainer.save_model("gs://****bucket***/model_mlm_exp1")
print("training finished")
The error that I get is:
None INFO train data created
None INFO training to start
None ERROR 0%| | 0/8 [00:00<?, ?it/s]train.py:70: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
None ERROR item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
None ERROR /opt/conda/lib/python3.7/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
None ERROR warnings.warn('Was asked to gather along dimension 0, but all '
/var/sitecustomize/sitecustomize.py INFO None
None ERROR 0%| | 0/8 [00:09<?, ?it/s]
Most of these messages are only warnings, but my code still stops with an error.
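For reference, the first warning points at __getitem__ in my dataset class. As far as I understand, it could be avoided by indexing the already-built tensors directly instead of re-wrapping them with torch.tensor. A minimal sketch, assuming the encodings are already PyTorch tensors (which they should be, since I pass return_tensors="pt"):

import torch

class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # slice the existing tensors instead of calling torch.tensor on them,
        # which is what triggers the copy-construct UserWarning
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["attention_mask"])

I assume that would only silence the copy-construct warning, though; I do not know whether it is related to the actual crash.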