I'm a lead PHP engineer overseeing some improvements to a system that formerly used an arcane concoction of randomize functions to pick from a set of things and distribute things to one of the randomly-picked items in an array.
I re-architected the randomization functions to use a Redis list and an RPOPLPUSH operation with the same key twice (to make a cycling ring of items), because in effect, the distributions for our business model are not and should not be random at all, but highly controlled.
Anyway, so we're deployed on AWS using Redis in ElastiCache. We have a 50/50 split distribution in a Redis list:
and so on.
Here's where it gets really bizarre.
Two days in a row, I've come into work to an "it doesn't work again" bug report and found that the list is empty.
I am 99.9% certain that no code in our entire system is actually supposed to be writing to this key - I manually lpush'd the values into it. The only operation is the RPOPLPUSH as illustrated above.
As soon as I run lines 1 and 2 again, everything is fine and dandy.
I tried a (blocking) BRPOPLPUSH but it took like 20 seconds to complete and we have not even placed the system under any kind of load so I switched it back to a non-blocking operation. Only a handful of users are actually performing the function that calls the code where the RPOPLPUSH takes place, so I find it really hard to believe it's even a concurrency issue, but still, I was under the impression that "all operations are atomic."
It would stand to reason that (in other, crappier data stores) if the POP is occurring but maybe the PUSH isn't, that could "empty the list"... but that's not possible in Redis lists, right?
>>> Redis::info()
=> [
"Server" => [
"redis_version" => "3.2.4",
"redis_git_sha1" => "0",
"redis_git_dirty" => "0",
"redis_build_id" => "0",
"redis_mode" => "standalone",
"os" => "Amazon ElastiCache",
"arch_bits" => "64",
"multiplexing_api" => "epoll",
"gcc_version" => "0.0.0",
"process_id" => "1",
"run_id" => "aa30fb0d9d22003abe84c88288281e4ae8bb466e",
"tcp_port" => "6379",
"uptime_in_seconds" => "40503574",
"uptime_in_days" => "468",
"hz" => "10",
"lru_clock" => "6413282",
"executable" => "-",
"config_file" => "-",
],
"Clients" => [
"connected_clients" => "85",
"client_longest_output_list" => "0",
"client_biggest_input_buf" => "0",
"blocked_clients" => "0",
],
"Memory" => [
"used_memory" => "6355376",
"used_memory_human" => "6.06M",
"used_memory_rss" => "22966272",
"used_memory_rss_human" => "21.90M",
"used_memory_peak" => "1090178424",
"used_memory_peak_human" => "1.02G",
"used_memory_lua" => "36864",
"used_memory_lua_human" => "36.00K",
"maxmemory" => "3461349376",
"maxmemory_human" => "3.22G",
"maxmemory_policy" => "volatile-lru",
"mem_fragmentation_ratio" => "3.61",
"mem_allocator" => "jemalloc-4.0.3",
],
"Persistence" => [
"loading" => "0",
"rdb_changes_since_last_save" => "46398029",
"rdb_bgsave_in_progress" => "0",
"rdb_last_save_time" => "1492636364",
"rdb_last_bgsave_status" => "ok",
"rdb_last_bgsave_time_sec" => "-1",
"rdb_current_bgsave_time_sec" => "-1",
"aof_enabled" => "0",
"aof_rewrite_in_progress" => "0",
"aof_rewrite_scheduled" => "0",
"aof_last_rewrite_time_sec" => "-1",
"aof_current_rewrite_time_sec" => "-1",
"aof_last_bgrewrite_status" => "ok",
"aof_last_write_status" => "ok",
],
"Stats" => [
"total_connections_received" => "5993993",
"total_commands_processed" => "381545021",
"instantaneous_ops_per_sec" => "16",
"total_net_input_bytes" => "21032777213",
"total_net_output_bytes" => "61084348387",
"instantaneous_input_kbps" => "0.83",
"instantaneous_output_kbps" => "4.58",
"rejected_connections" => "0",
"sync_full" => "0",
"sync_partial_ok" => "0",
"sync_partial_err" => "0",
"expired_keys" => "4062869",
"evicted_keys" => "0",
"keyspace_hits" => "15367057",
"keyspace_misses" => "298696121",
"pubsub_channels" => "0",
"pubsub_patterns" => "0",
"latest_fork_usec" => "0",
"migrate_cached_sockets" => "0",
],
"Replication" => [
"role" => "master",
"connected_slaves" => "0",
"master_repl_offset" => "0",
"repl_backlog_active" => "0",
"repl_backlog_size" => "1048576",
"repl_backlog_first_byte_offset" => "0",
"repl_backlog_histlen" => "0",
],
"CPU" => [
"used_cpu_sys" => "28082.15",
"used_cpu_user" => "33219.75",
"used_cpu_sys_children" => "0.00",
"used_cpu_user_children" => "0.00",
],
"Cluster" => [
"cluster_enabled" => "0",
],
"Keyspace" => [
"db0" => [
"keys" => "3776",
"expires" => "4",
"avg_ttl" => "37148",
],
],
]