Yes, reproducible distributed random numbers can be tricky business.
Here is some IPython code that shows the problems and a likely suboptimal solution. For gensim, you could possibly have the seed set by the user and/or, depending on what you want, derived from the hash of the chunk or something along these lines - provided the chunks are always the same and they are hashable (or can be converted to be). I'm not sure about the performance hit here, but it might be negligible or unimportant.
import numpy as np
from IPython.parallel import Client
def random_array(size):
    """Return ``size`` uniform random samples from NumPy's global generator.

    The values are drawn with ``np.random.random``, so the result depends on
    the ``np.random`` state of whichever process executes the call.

    Parameters
    ----------
    size : int or tuple of ints
        Shape of the array to generate.

    Returns
    -------
    numpy.ndarray
        Array of the requested shape.
    """
    return np.random.random(size)
def random_array_stateful(size, prng):
    """Return ``size`` random samples drawn from an explicit generator.

    Parameters
    ----------
    size : int
        Number of samples to draw.
    prng : object with a ``rand(size)`` method
        Generator to draw from (the calling script passes a
        ``np.random.RandomState``). Drawing advances its internal state.

    Returns
    -------
    numpy.ndarray
        The samples produced by ``prng.rand(size)``.
    """
    return prng.rand(size)
def random_array_naive_solution(size, prng, seed):
    """Reseed ``prng`` with ``seed`` and return ``size`` samples from it.

    Because the generator is reseeded on every call, the output is determined
    by ``seed`` and ``size`` alone, regardless of the state ``prng`` was in.

    Parameters
    ----------
    size : int
        Number of samples to draw.
    prng : object with ``seed(seed)`` and ``rand(size)`` methods
        Generator to reseed and draw from (the calling script passes a
        ``np.random.RandomState``). It is modified in place.
    seed : int
        Seed handed to ``prng.seed``.

    Returns
    -------
    numpy.ndarray
        The samples produced by ``prng.rand(size)`` after reseeding.
    """
    prng.seed(seed)
    return prng.rand(size)
# Connect to the IPython cluster, take a view over all of its engines, and
# make NumPy importable as ``np`` on each of them.
rc = Client()
dview = rc[:]
dview.execute("import numpy as np")

# Every experiment below requests five arrays of five samples each.
sizes = [5] * 5

# Not reproducible: seeding the global generator in this process is not
# known to the engines, which draw from their own global state.
np.random.seed(12345)
p_res = dview.map_sync(random_array, sizes)
np.random.seed(12345)
p_res2 = dview.map_sync(random_array, sizes)

# Gives the same values for each task: every task receives the same
# generator in the same state.
prng = np.random.RandomState(12345)
p_res3 = dview.map_sync(random_array_stateful, sizes, [prng] * len(sizes))
prng.seed(12345)
p_res4 = dview.map_sync(random_array_stateful, sizes, [prng] * len(sizes))

# Naive "solution": derive one seed per task from a master generator and let
# each task reseed the generator it is given. Both runs below reuse the same
# seeds, so they are expected to produce matching results.
prng.seed(12345)
# Integer upper bound: ``randint`` is documented to take integer limits.
seeds = prng.randint(0, 10**6, size=len(sizes))
p_res5 = dview.map_sync(random_array_naive_solution, sizes,
                        [prng] * len(sizes), seeds)
p_res6 = dview.map_sync(random_array_naive_solution, sizes,
                        [prng] * len(sizes), seeds)