Ok, I'm running into a new problem now, this time with the multi-class targets. Since this is a multi-class classification problem, I am using
NLLLoss as my loss function. The expected shape of the input is (batch size, number of classes), and I have confirmed that mine matches. The expected shape of the target is (batch size).
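For reference, here is a minimal standalone PyTorch sketch (made-up sizes, not my actual model) of the shape behavior I mean:

```python
# Minimal standalone sketch (made-up sizes, not my actual model) of the
# NLLLoss shape requirements described above.
import torch
import torch.nn as nn

criterion = nn.NLLLoss()
log_probs = torch.log_softmax(torch.randn(64, 3), dim=1)  # (batch size, number of classes)

targets_1d = torch.randint(0, 3, (64,))   # shape (64,): integer class indices -- this works
loss = criterion(log_probs, targets_1d)

targets_col = targets_1d.unsqueeze(1)     # shape (64, 1): a column vector
# criterion(log_probs, targets_col)       # raises the same RuntimeError as in log 1:
#                                         # "1D target tensor expected, multi-target not supported"
```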
Since I am using custom FASTA files, I created MAT files with the sequences and their targets, based on the download_data.py script in the regression MPRA example. According to the documentation for MatFileSampler, the targets get loaded as a (batch size, number of features) matrix -- in this case a column vector. I suspect this is the reason for my error ("log 1" below), but I'm not sure how to sidestep it, since it appears to be built into Selene.
I also tried writing the targets to the MAT file as a list rather than as a (number of sequences, 1) column vector, but that gives me a different error ("log 2" below). A rough sketch of both attempts follows.
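This is roughly how I'm writing the MAT files (variable and key names here are placeholders, not my exact script):

```python
# Rough sketch of how the MAT files are written (placeholder key names and
# shapes; the real script is adapted from download_data.py in the MPRA example).
import numpy as np
from scipy.io import savemat

n_seqs, seq_len = 1472, 600
sequences = np.zeros((n_seqs, seq_len, 4))          # one-hot encoded sequences
labels = np.random.randint(0, 3, size=n_seqs)       # integer class labels, one per sequence

# Attempt 1 (gives log 1): targets written as an (n_seqs, 1) column vector
savemat("validate_v1.mat", {"seqs": sequences,
                            "targets": labels.reshape(-1, 1)})

# Attempt 2 (gives log 2): targets written as a plain Python list
savemat("validate_v2.mat", {"seqs": sequences,
                            "targets": labels.tolist()})
```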
Thanks for your help!
Best,
Ryan
LOG 1
Outputs and logs saved to ./Data/SeleneFiles/enhancer_model_outputs/2021-03-10-20-50-47
2021-03-10 20:50:47,412 - Training parameters set: batch size 64, number of steps per 'epoch': 180, maximum number of steps: 80000
2021-03-10 20:50:47,412 - Creating validation dataset.
2021-03-10 20:50:47,416 - 0.0034127235412597656 s to load 1472 validation examples (23 validation batches) to evaluate after each training step.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-12-9d4cdb07b008> in <module>
1 configs = selene_sdk.utils.load_path(os.path.join(output_dir, "train_eval_model.yml"))
----> 2 selene_sdk.utils.parse_configs_and_run(configs, lr=0.001)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config_utils.py in parse_configs_and_run(configs, create_subdirectory, lr)
339 "Using a random seed ensures results are reproducible.")
340
--> 341 execute(operations, configs, current_run_output_dir)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config_utils.py in execute(operations, configs, output_dir)
186 "evaluate" in operations:
187 train_model.create_test_set()
--> 188 train_model.train_and_validate()
189
190 elif op == "evaluate":
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/train_model.py in train_and_validate(self)
380 for step in range(self._start_step, self.max_steps):
381 t_i = time()
--> 382 train_loss = self.train()
383 t_f = time()
384 time_per_step.append(t_f - t_i)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/train_model.py in train(self)
464 predictions = self.model(inputs.transpose(1, 2))
--> 465 loss = self.criterion(predictions, targets)
466
467 self.optimizer.zero_grad()
~/miniconda/envs/selene/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda/envs/selene/lib/python3.7/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
211
212 def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 213 return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
214
215
~/miniconda/envs/selene/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
2262 .format(input.size(0), target.size(0)))
2263 if dim == 2:
-> 2264 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2265 elif dim == 4:
2266 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: 1D target tensor expected, multi-target not supported
LOG 2
Outputs and logs saved to ./Data/SeleneFiles/enhancer_model_outputs/2021-03-10-21-00-04
2021-03-10 21:00:04,255 - Training parameters set: batch size 64, number of steps per 'epoch': 180, maximum number of steps: 80000
2021-03-10 21:00:04,256 - Creating validation dataset.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-14-9d4cdb07b008> in <module>
1 configs = selene_sdk.utils.load_path(os.path.join(output_dir, "train_eval_model.yml"))
----> 2 selene_sdk.utils.parse_configs_and_run(configs, lr=0.001)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config_utils.py in parse_configs_and_run(configs, create_subdirectory, lr)
339 "Using a random seed ensures results are reproducible.")
340
--> 341 execute(operations, configs, current_run_output_dir)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config_utils.py in execute(operations, configs, output_dir)
181 train_model_info.bind(output_dir=output_dir)
182
--> 183 train_model = instantiate(train_model_info)
184 # TODO: will find a better way to handle this in the future
185 if "load_test_set" in configs and configs["load_test_set"] and \
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config.py in instantiate(proxy, bindings)
237 bindings = {}
238 if isinstance(proxy, _Proxy):
--> 239 return _instantiate_proxy_tuple(proxy, bindings)
240 elif isinstance(proxy, dict):
241 # Recurse on the keys too, for backward compatibility.
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/utils/config.py in _instantiate_proxy_tuple(proxy, bindings)
142 kwargs = dict((k, instantiate(v, bindings))
143 for k, v in six.iteritems(proxy.keywords))
--> 144 obj = proxy.callable(**kwargs)
145 try:
146 obj.yaml_src = proxy.yaml_src
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/train_model.py in __init__(self, model, data_sampler, loss_criterion, optimizer_class, optimizer_kwargs, batch_size, max_steps, report_stats_every_n_steps, output_dir, save_checkpoint_every_n_steps, save_new_checkpoints_after_n_steps, report_gt_feature_n_positives, n_validation_samples, n_test_samples, cpu_n_threads, use_cuda, data_parallel, logging_verbosity, checkpoint_resume, metrics)
243 verbosity=logging_verbosity)
244
--> 245 self._create_validation_set(n_samples=n_validation_samples)
246 self._validation_metrics = PerformanceMetrics(
247 self.sampler.get_feature_from_index,
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/train_model.py in _create_validation_set(self, n_samples)
312 self._validation_data, self._all_validation_targets = \
313 self.sampler.get_validation_set(
--> 314 self.batch_size, n_samples=n_samples)
315 t_f = time()
316 logger.info(("{0} s to load {1} validation examples ({2} validation "
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/samplers/multi_file_sampler.py in get_validation_set(self, batch_size, n_samples)
170 """
171 return self._samplers["validate"].get_data_and_targets(
--> 172 batch_size, n_samples)
173
174 def get_test_set(self, batch_size, n_samples=None):
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/samplers/file_samplers/mat_file_sampler.py in get_data_and_targets(self, batch_size, n_samples)
243 count = batch_size
244 while count < n_samples:
--> 245 seqs, tgts = self.sample(batch_size=batch_size)
246 sequences_and_targets.append((seqs, tgts))
247 targets_mat.append(tgts)
~/miniconda/envs/selene/lib/python3.7/site-packages/selene_sdk/samplers/file_samplers/mat_file_sampler.py in sample(self, batch_size)
162 if self._sample_tgts is not None:
163 if self._tgts_batch_axis == 0:
--> 164 targets = self._sample_tgts[use_indices, :].astype(float)
165 else:
166 targets = self._sample_tgts[:, use_indices].astype(float)
IndexError: index 1 is out of bounds for axis 0 with size 1