I try to use two gpus to train with a little modification of python/train.py. I modified function
solve_step(proto, snapshot, gpus, timing, uid, rank) to solve_step just add some test forward function
solver.test_nets[0].forward() etc.
def solve_step(proto, snapshot, gpus, timing, uid, rank):
caffe.set_mode_gpu()
caffe.set_device(gpus[rank])
caffe.set_solver_count(len(gpus))
caffe.set_solver_rank(rank)
caffe.set_multiprocess(True)
solver = caffe.SGDSolver(proto)
if snapshot and len(snapshot) != 0:
solver.restore(snapshot)
nccl = caffe.NCCL(solver, uid)
nccl.bcast()
if timing and rank == 0:
time(solver, nccl)
else:
solver.add_callback(nccl)
if solver.param.layer_wise_reduce:
solver.net.after_backward(nccl)
niter = solver.param.max_iter
display = solver.param.display
test_iter = 950
test_interval = 200
# 初始化
train_loss = zeros(int(ceil(niter // display)))
test_loss = zeros(int(ceil(niter // test_interval)))
test_acc = zeros(int(ceil(niter // test_interval)))
# 辅助变量
_train_loss = 0;
_test_loss = 0;
_accuracy = 0;
_max_accuracy = 0;
_max_accuracy_iter = 0;
# 进行解算
for it in range(niter):
solver.step(1)
_train_loss += solver.net.blobs['rgb_flow_gating_loss'].data
if it % display == 0:
train_loss[it // display] = _train_loss / display
_train_loss = 0
if it % test_interval == 0:
print '\n my test, train iteration', it
for test_it in range(test_iter):
#print '\n my test, test iteration \n', test_it
solver.test_nets[0].forward()
_test_loss += solver.test_nets[0].blobs['rgb_flow_gating_loss'].data
_accuracy += solver.test_nets[0].blobs['rgb_flow_gating_accuracy'].data
test_loss[it / test_interval] = _test_loss / test_iter
test_acc[it / test_interval] = _accuracy / test_iter
if _max_accuracy < test_acc[it / test_interval]:
_max_accuracy = test_acc[it / test_interval]
_max_accuracy_iter = it
solver.net.save('/home/zhujiagang/temporal-segment-networks/models/ucf101_split_1_gating_three_iter_' + str(it) + '.caffemodel')
print '\nnewly max: _max_accuracy and _max_accuracy_iter', _max_accuracy, _max_accuracy_iter
print '\n_max_accuracy and _max_accuracy_iter', _max_accuracy, _max_accuracy_iter
_test_loss = 0
_accuracy = 0
I've got everything ready including NCCL installation. But When I ran my code, I got
Process Process-2:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/home/zhujiagang/temporal-segment-networks/multigpu.py", line 152, in solve_step
solver.test_nets[0].forward()IndexError: Index out of rangeIt seems like that Process-2 conflicts with Process-1 in excuating
solver.test_nets[0].forward().I also tested by removing the test code below
solver.step(1), that is only training code is left. The training is going well with two gpus.
How can I train my model with test codes
solver.test_nets[0].forward() and training codes together with multi-gpu without error
?