[flaxcode] r1336 committed - added poc

9 views

Skip to first unread message

codesite...@google.com

unread,

Jun 22, 2012, 5:48:21 AM6/22/12

to flax-c...@googlegroups.com

Revision: 1336
Author: banoffi
Date: Fri Jun 22 02:46:58 2012
Log: added poc
http://code.google.com/p/flaxcode/source/detail?r=1336

Added:
/trunk/LuceneRedisCodec
/trunk/LuceneRedisCodec/poc
/trunk/LuceneRedisCodec/poc/ExternalFieldCodec.iml
/trunk/LuceneRedisCodec/poc/README
/trunk/LuceneRedisCodec/poc/build.xml
/trunk/LuceneRedisCodec/poc/etc
/trunk/LuceneRedisCodec/poc/etc/redis.conf
/trunk/LuceneRedisCodec/poc/lib

/trunk/LuceneRedisCodec/poc/lib/jredis-core-ri-a.0-SNAPSHOT-jar-with-dependencies.jar
/trunk/LuceneRedisCodec/poc/lib/junit-4.8.2.jar
/trunk/LuceneRedisCodec/poc/lib/lucene-analyzers-common-4.0-SNAPSHOT.jar
/trunk/LuceneRedisCodec/poc/lib/lucene-core-4.0-SNAPSHOT.jar
/trunk/LuceneRedisCodec/poc/oldsrc
/trunk/LuceneRedisCodec/poc/oldsrc/externalcodec
/trunk/LuceneRedisCodec/poc/oldsrc/org
/trunk/LuceneRedisCodec/poc/oldsrc/org/apache
/trunk/LuceneRedisCodec/poc/oldsrc/org/apache/lucene
/trunk/LuceneRedisCodec/poc/oldsrc/org/apache/lucene/index

/trunk/LuceneRedisCodec/poc/oldsrc/org/apache/lucene/index/UpdatingIndexWriter.java
/trunk/LuceneRedisCodec/poc/oldsrc/uk
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/Example.java

/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/ExternalCodec.java

/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/ExternalPostingsFormat.java

/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/UpdatingCodecMergePolicy.java
/trunk/LuceneRedisCodec/poc/src
/trunk/LuceneRedisCodec/poc/src/java
/trunk/LuceneRedisCodec/poc/src/java/org
/trunk/LuceneRedisCodec/poc/src/java/org/apache
/trunk/LuceneRedisCodec/poc/src/java/org/apache/lucene
/trunk/LuceneRedisCodec/poc/src/java/org/apache/lucene/index
/trunk/LuceneRedisCodec/poc/src/java/uk
/trunk/LuceneRedisCodec/poc/src/java/uk/co
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/Diff.java

/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingFieldsConsumer.java

/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingFieldsProducer.java

/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingPostingsFormat.java

/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingPostingsWriter.java

/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingTermsWriter.java
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/Updater.java
/trunk/LuceneRedisCodec/poc/src/resources
/trunk/LuceneRedisCodec/poc/src/resources/META-INF
/trunk/LuceneRedisCodec/poc/src/resources/META-INF/services

/trunk/LuceneRedisCodec/poc/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
/trunk/LuceneRedisCodec/poc/src/test
/trunk/LuceneRedisCodec/poc/src/test/uk
/trunk/LuceneRedisCodec/poc/src/test/uk/co
/trunk/LuceneRedisCodec/poc/src/test/uk/co/flax
/trunk/LuceneRedisCodec/poc/src/test/uk/co/flax/rediscodec
/trunk/LuceneRedisCodec/poc/src/test/uk/co/flax/rediscodec/TestCodec.java

=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/ExternalFieldCodec.iml Fri Jun 22 02:46:58
2012
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+ <component name="NewModuleRootManager" inherit-compiler-output="true">
+ <exclude-output />
+ <content url="file://$MODULE_DIR$">
+ <sourceFolder url="file://$MODULE_DIR$/src/java"
isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/resources"
isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true"
/>
+ </content>
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ <orderEntry type="library" name="lucene-core-4.0-SNAPSHOT"
level="project" />
+ <orderEntry type="library" name="lucene-analyzers-common-4.0-SNAPSHOT"
level="project" />
+ <orderEntry type="module-library" scope="TEST">
+ <library>
+ <CLASSES>
+ <root url="jar://$APPLICATION_HOME_DIR$/lib/junit-4.10.jar!/" />
+ </CLASSES>
+ <JAVADOC />
+ <SOURCES />
+ </library>
+ </orderEntry>
+ <orderEntry type="library"
name="jredis-core-ri-a.0-SNAPSHOT-jar-with-dependencies" level="project" />
+ </component>
+</module>
+
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/README Fri Jun 22 02:46:58 2012
@@ -0,0 +1,45 @@
+So here's a sketched solution:
+
+The updateable term postings lists are stored as a binary blob in redis,
keyed
+by term and segment, decodable into a list of integer docids.
+ seg_term = 0, 1, 5, 23, ...
+
+We create a new Codec that can a) fetch this list from a redis instance and
+create DocsEnum iterators over it, and b) write merged segment postings
back out
+to redis.
+
+Updates are not done through the codec, however. A separate update tool
talks
+directly to redis and makes changes to the postings lists without talking
to
+lucene.
+ Given a docid and a list of terms to remove and add:
+ - find the segment and internal docid of the document
+ - edit the postings lists of each term for that segment
+Might want to do edits in batches, depending on how efficient this turns
out to
+be.
+
+You'll need to be running a redis instance for the test to work.
+ $ apt-get install redis
+ $ redis-server poc/etc/redis.conf
+
+test/uk/co/flax/rediscodec/TestCodec illustrates the process. You can run
it from
+the command line as follows:
+ $ ant test
+
+
+Caveats:
+ - This is a hacked-together proof of concept, not a piece of production
software! As
+ such, the codec is sorely incomplete. It does not store positions or
doc frequencies,
+ and its postings format is extremely inefficient. You can only store a
+ single field, and it has to be called 'tag'. You can only use a redis
server
+ running on localhost against the default port. And so on, and so
forth. All of these
+ can of course be improved.
+
+ - Segment merges should be dealt with without a problem here, but we will
need to ensure
+ that we somehow integrate the Updater with the Lucene index locking
machinery. At the
+ moment it is possible to run the updater while a merge is happening,
which will result
+ in a corrupt index.
+
+ - Redis will back up its state to disk periodically, so that we can
recover indexes in
+ the event of a system crash. It's probably worth investigating the
various settings
+ properly here though, to ensure no data loss. Maybe telling redis to
dump the db
+ state after every update?
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/build.xml Fri Jun 22 02:46:58 2012
@@ -0,0 +1,62 @@
+
+
+
+<project name="rediscodec">
+
+ <description>
+ Proof-of-concept project illustrating a Lucene codec using redis
+ </description>
+
+ 
+ <property name="javasrc" location="src/java"/>
+ <property name="testsrc" location="src/test"/>
+ <property name="resources" location="src/resources"/>
+
+ 
+ <property name="class-dir"
location="out/production/ExternalFieldCodec"/>
+ <property name="test-class-dir"
location="out/test/ExternalFieldCodec"/>
+ <property name="test-report-dir" location="out/test/reports"/>
+
+ 
+ <path id="classpath.base">
+ <fileset dir="lib">
+ <include name="**/*.jar"/>
+ </fileset>
+ </path>
+
+ <path id="classpath.test">
+ <pathelement location="${class-dir}"/>
+ <path refid="classpath.base"/>
+ </path>
+
+ <target name="init">
+ <mkdir dir="${class-dir}"/>
+ <mkdir dir="${test-class-dir}"/>
+ </target>
+
+ <target name="compile" description="Compile sources">
+ <javac srcdir="${javasrc}" destdir="${class-dir}" encoding="UTF-8">
+ <classpath refid="classpath.base"/>
+ </javac>
+ <copy todir="${class-dir}">
+ <fileset dir="${resources}"/>
+ </copy>
+ </target>
+
+ <target name="compile-test" depends="compile" description="Compile
test sources">
+ <javac srcdir="${testsrc}" destdir="${test-class-dir}"
encoding="UTF-8">
+ <classpath refid="classpath.test"/>
+ </javac>
+ </target>
+
+ <target name="test" depends="compile-test" description="Run tests">
+ <junit printsummary="yes" fork="yes" showoutput="yes">
+ <classpath>
+ <path location="${test-class-dir}"/>
+ <path refid="classpath.test"/>
+ </classpath>
+ <test name="uk.co.flax.rediscodec.TestCodec"/>
+ </junit>
+ </target>
+
+</project>
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/etc/redis.conf Fri Jun 22 02:46:58 2012
@@ -0,0 +1,492 @@
+# Redis configuration file example
+
+# Note on units: when memory size is needed, it is possible to specify
+# it in the usual form of 1k 5GB 4M and so forth:
+#
+# 1k => 1000 bytes
+# 1kb => 1024 bytes
+# 1m => 1000000 bytes
+# 1mb => 1024*1024 bytes
+# 1g => 1000000000 bytes
+# 1gb => 1024*1024*1024 bytes
+#
+# units are case insensitive so 1GB 1Gb 1gB are all the same.
+
+# By default Redis does not run as a daemon. Use 'yes' if you need it.
+# Note that Redis will write a pid file in /usr/local/var/run/redis.pid
when daemonized.
+daemonize no
+
+# When running daemonized, Redis writes a pid file in
/usr/local/var/run/redis.pid by
+# default. You can specify a custom pid file location here.
+pidfile /usr/local/var/run/redis.pid
+
+# Accept connections on the specified port, default is 6379.
+# If port 0 is specified Redis will not listen on a TCP socket.
+port 6379
+
+# If you want you can bind a single interface, if the bind option is not
+# specified all the interfaces will listen for incoming connections.
+#
+bind 127.0.0.1
+
+# Specify the path for the unix socket that will be used to listen for
+# incoming connections. There is no default, so Redis will not listen
+# on a unix socket when not specified.
+#
+# unixsocket /tmp/redis.sock
+# unixsocketperm 755
+
+# Close the connection after a client is idle for N seconds (0 to disable)
+timeout 0
+
+# Set the server verbosity level.
+# It can be one of:
+# debug (a lot of information, useful for development/testing)
+# verbose (many rarely useful info, but not a mess like the debug level)
+# notice (moderately verbose, what you want in production probably)
+# warning (only very important / critical messages are logged)
+loglevel verbose
+
+# Specify the log file name. Also 'stdout' can be used to force
+# Redis to log on the standard output. Note that if you use standard
+# output for logging but daemonize, logs will be sent to /dev/null
+logfile stdout
+
+# To enable logging to the system logger, just set 'syslog-enabled' to yes,
+# and optionally update the other syslog parameters to suit your needs.
+# syslog-enabled no
+
+# Specify the syslog identity.
+# syslog-ident redis
+
+# Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7.
+# syslog-facility local0
+
+# Set the number of databases. The default database is DB 0, you can select
+# a different one on a per-connection basis using SELECT <dbid> where
+# dbid is a number between 0 and 'databases'-1
+databases 16
+
+################################ SNAPSHOTTING
#################################
+#
+# Save the DB on disk:
+#
+# save <seconds> <changes>
+#
+# Will save the DB if both the given number of seconds and the given
+# number of write operations against the DB occurred.
+#
+# In the example below the behaviour will be to save:
+# after 900 sec (15 min) if at least 1 key changed
+# after 300 sec (5 min) if at least 10 keys changed
+# after 60 sec if at least 10000 keys changed
+#
+# Note: you can disable saving at all commenting all the "save" lines.
+
+save 900 1
+save 300 10
+save 60 10000
+
+# Compress string objects using LZF when dumping .rdb databases?
+# By default that's set to 'yes' as it's almost always a win.
+# If you want to save some CPU in the saving child set it to 'no' but
+# the dataset will likely be bigger if you have compressible values or
keys.
+rdbcompression yes
+
+# The filename where to dump the DB
+dbfilename dump.rdb
+
+# The working directory.
+#
+# The DB will be written inside this directory, with the filename specified
+# above using the 'dbfilename' configuration directive.
+#
+# Also the Append Only File will be created inside this directory.
+#
+# Note that you must specify a directory here, not a file name.
+dir /usr/local/var/db/redis/
+
+################################# REPLICATION
#################################
+
+# Master-Slave replication. Use slaveof to make a Redis instance a copy of
+# another Redis server. Note that the configuration is local to the slave
+# so for example it is possible to configure the slave to save the DB with
a
+# different interval, or to listen to another port, and so on.
+#
+# slaveof <masterip> <masterport>
+
+# If the master is password protected (using the "requirepass"
configuration
+# directive below) it is possible to tell the slave to authenticate before
+# starting the replication synchronization process, otherwise the master
will
+# refuse the slave request.
+#
+# masterauth <master-password>
+
+# When a slave lost the connection with the master, or when the replication
+# is still in progress, the slave can act in two different ways:
+#
+# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will
+# still reply to client requests, possibly with out of date data, or the
+# data set may just be empty if this is the first synchronization.
+#
+# 2) if slave-serve-stale data is set to 'no' the slave will reply with
+# an error "SYNC with master in progress" to all the kind of commands
+# but to INFO and SLAVEOF.
+#
+slave-serve-stale-data yes
+
+# Slaves send PINGs to server in a predefined interval. It's possible to
change
+# this interval with the repl_ping_slave_period option. The default value
is 10
+# seconds.
+#
+# repl-ping-slave-period 10
+
+# The following option sets a timeout for both Bulk transfer I/O timeout
and
+# master data or ping response timeout. The default value is 60 seconds.
+#
+# It is important to make sure that this value is greater than the value
+# specified for repl-ping-slave-period otherwise a timeout will be detected
+# every time there is low traffic between the master and the slave.
+#
+# repl-timeout 60
+
+################################## SECURITY
###################################
+
+# Require clients to issue AUTH <PASSWORD> before processing any other
+# commands. This might be useful in environments in which you do not trust
+# others with access to the host running redis-server.
+#
+# This should stay commented out for backward compatibility and because
most
+# people do not need auth (e.g. they run their own servers).
+#
+# Warning: since Redis is pretty fast an outside user can try up to
+# 150k passwords per second against a good box. This means that you should
+# use a very strong password otherwise it will be very easy to break.
+#
+# requirepass foobared
+
+# Command renaming.
+#
+# It is possible to change the name of dangerous commands in a shared
+# environment. For instance the CONFIG command may be renamed into
something
+# of hard to guess so that it will be still available for internal-use
+# tools but not available for general clients.
+#
+# Example:
+#
+# rename-command CONFIG b840fc02d524045429941cc15f59e41cb7be6c52
+#
+# It is also possible to completely kill a command by renaming it into
+# an empty string:
+#
+# rename-command CONFIG ""
+
+################################### LIMITS
####################################
+
+# Set the max number of connected clients at the same time. By default
there
+# is no limit, and it's up to the number of file descriptors the Redis
process
+# is able to open. The special value '0' means no limits.
+# Once the limit is reached Redis will close all the new connections
sending
+# an error 'max number of clients reached'.
+#
+# maxclients 128
+
+# Don't use more memory than the specified amount of bytes.
+# When the memory limit is reached Redis will try to remove keys
+# according to the eviction policy selected (see maxmemory-policy).
+#
+# If Redis can't remove keys according to the policy, or if the policy is
+# set to 'noeviction', Redis will start to reply with errors to commands
+# that would use more memory, like SET, LPUSH, and so on, and will continue
+# to reply to read-only commands like GET.
+#
+# This option is usually useful when using Redis as an LRU cache, or to set
+# an hard memory limit for an instance (using the 'noeviction' policy).
+#
+# WARNING: If you have slaves attached to an instance with maxmemory on,
+# the size of the output buffers needed to feed the slaves are subtracted
+# from the used memory count, so that network problems / resyncs will
+# not trigger a loop where keys are evicted, and in turn the output
+# buffer of slaves is full with DELs of keys evicted triggering the
deletion
+# of more keys, and so forth until the database is completely emptied.
+#
+# In short... if you have slaves attached it is suggested that you set a
lower
+# limit for maxmemory so that there is some free RAM on the system for
slave
+# output buffers (but this is not needed if the policy is 'noeviction').
+#
+# maxmemory <bytes>
+
+# MAXMEMORY POLICY: how Redis will select what to remove when maxmemory
+# is reached? You can select among six behaviors:
+#
+# volatile-lru -> remove the key with an expire set using an LRU algorithm
+# allkeys-lru -> remove any key accordingly to the LRU algorithm
+# volatile-random -> remove a random key with an expire set
+# allkeys-random -> remove a random key, any key
+# volatile-ttl -> remove the key with the nearest expire time (minor TTL)
+# noeviction -> don't expire at all, just return an error on write
operations
+#
+# Note: with all the kind of policies, Redis will return an error on write
+# operations, when there are not suitable keys for eviction.
+#
+# At the date of writing these commands are: set setnx setex append
+# incr decr rpush lpush rpushx lpushx linsert lset rpoplpush sadd
+# sinter sinterstore sunion sunionstore sdiff sdiffstore zadd zincrby
+# zunionstore zinterstore hset hsetnx hmset hincrby incrby decrby
+# getset mset msetnx exec sort
+#
+# The default is:
+#
+# maxmemory-policy volatile-lru
+
+# LRU and minimal TTL algorithms are not precise algorithms but
approximated
+# algorithms (in order to save memory), so you can select as well the
sample
+# size to check. For instance, by default Redis will check three keys and
+# pick the one that was used less recently, you can change the sample size
+# using the following configuration directive.
+#
+# maxmemory-samples 3
+
+############################## APPEND ONLY MODE
###############################
+
+# By default Redis asynchronously dumps the dataset on disk. If you can
live
+# with the idea that the latest records will be lost if something like a
crash
+# happens this is the preferred way to run Redis. If instead you care a lot
+# about your data and don't want a single record to get lost, you
should
+# enable the append only mode: when this mode is enabled Redis will append
+# every write operation received in the file appendonly.aof. This file will
+# be read on startup in order to rebuild the full dataset in memory.
+#
+# Note that you can have both the async dumps and the append only file if
you
+# like (you have to comment the "save" statements above to disable the
dumps).
+# Still if append only mode is enabled Redis will load the data from the
+# log file at startup ignoring the dump.rdb file.
+#
+# IMPORTANT: Check the BGREWRITEAOF to check how to rewrite the append
+# log file in background when it gets too big.
+
+appendonly no
+
+# The name of the append only file (default: "appendonly.aof")
+# appendfilename appendonly.aof
+
+# The fsync() call tells the Operating System to actually write data on
disk
+# instead of waiting for more data in the output buffer. Some OS will really
flush
+# data on disk, some other OS will just try to do it ASAP.
+#
+# Redis supports three different modes:
+#
+# no: don't fsync, just let the OS flush the data when it wants. Faster.
+# always: fsync after every write to the append only log . Slow, Safest.
+# everysec: fsync only if one second passed since the last fsync.
Compromise.
+#
+# The default is "everysec" that's usually the right compromise between
+# speed and data safety. It's up to you to understand if you can relax
this to
+# "no" that will let the operating system flush the output buffer when
+# it wants, for better performances (but if you can live with the idea of
+# some data loss consider the default persistence mode that's
snapshotting),
+# or on the contrary, use "always" that's very slow but a bit safer than
+# everysec.
+#
+# If unsure, use "everysec".
+
+# appendfsync always
+appendfsync everysec
+# appendfsync no
+
+# When the AOF fsync policy is set to always or everysec, and a background
+# saving process (a background save or AOF log background rewriting) is
+# performing a lot of I/O against the disk, in some Linux configurations
+# Redis may block too long on the fsync() call. Note that there is no fix
for
+# this currently, as even performing fsync in a different thread will block
+# our synchronous write(2) call.
+#
+# In order to mitigate this problem it's possible to use the following
option
+# that will prevent fsync() from being called in the main process while a
+# BGSAVE or BGREWRITEAOF is in progress.
+#
+# This means that while another child is saving the durability of Redis is
+# the same as "appendfsync none", that in practical terms means that it is
+# possible to lose up to 30 seconds of log in the worst scenario (with the
+# default Linux settings).
+#
+# If you have latency problems turn this to "yes". Otherwise leave it as
+# "no" that is the safest pick from the point of view of durability.
+no-appendfsync-on-rewrite no
+
+# Automatic rewrite of the append only file.
+# Redis is able to automatically rewrite the log file implicitly calling
+# BGREWRITEAOF when the AOF log size grows by the specified
percentage.
+#
+# This is how it works: Redis remembers the size of the AOF file after the
+# latest rewrite (or if no rewrite happened since the restart, the size of
+# the AOF at startup is used).
+#
+# This base size is compared to the current size. If the current size is
+# bigger than the specified percentage, the rewrite is triggered. Also
+# you need to specify a minimal size for the AOF file to be rewritten, this
+# is useful to avoid rewriting the AOF file even if the percentage increase
+# is reached but it is still pretty small.
+#
+# Specify a percentage of zero in order to disable the automatic AOF
+# rewrite feature.
+
+auto-aof-rewrite-percentage 100
+auto-aof-rewrite-min-size 64mb
+
+################################## SLOW LOG
###################################
+
+# The Redis Slow Log is a system to log queries that exceeded a specified
+# execution time. The execution time does not include the I/O operations
+# like talking with the client, sending the reply and so forth,
+# but just the time needed to actually execute the command (this is the
only
+# stage of command execution where the thread is blocked and can not serve
+# other requests in the meantime).
+#
+# You can configure the slow log with two parameters: one tells Redis
+# what is the execution time, in microseconds, to exceed in order for the
+# command to get logged, and the other parameter is the length of the
+# slow log. When a new command is logged the oldest one is removed from the
+# queue of logged commands.
+
+# The following time is expressed in microseconds, so 1000000 is equivalent
+# to one second. Note that a negative number disables the slow log, while
+# a value of zero forces the logging of every command.
+slowlog-log-slower-than 10000
+
+# There is no limit to this length. Just be aware that it will consume
memory.
+# You can reclaim memory used by the slow log with SLOWLOG RESET.
+slowlog-max-len 128
+
+################################ VIRTUAL MEMORY
###############################
+
+### WARNING! Virtual Memory is deprecated in Redis 2.4
+### The use of Virtual Memory is strongly discouraged.
+
+# Virtual Memory allows Redis to work with datasets bigger than the actual
+# amount of RAM needed to hold the whole dataset in memory.
+# In order to do so very used keys are taken in memory while the other keys
+# are swapped into a swap file, similarly to what operating systems do
+# with memory pages.
+#
+# To enable VM just set 'vm-enabled' to yes, and set the following three
+# VM parameters accordingly to your needs.
+
+vm-enabled no
+# vm-enabled yes
+
+# This is the path of the Redis swap file. As you can guess, swap files
+# can't be shared by different Redis instances, so make sure to use a swap
+# file for every redis process you are running. Redis will complain if the
+# swap file is already in use.
+#
+# The best kind of storage for the Redis swap file (that's accessed at
random)
+# is a Solid State Disk (SSD).
+#
+# *** WARNING *** if you are using a shared hosting the default of putting
+# the swap file under /tmp is not secure. Create a dir with access granted
+# only to Redis user and configure Redis to create the swap file there.
+vm-swap-file /tmp/redis.swap
+
+# vm-max-memory configures the VM to use at max the specified amount of
+# RAM. Everything that does not fit will be swapped on disk *if* possible,
that
+# is, if there is still enough contiguous space in the swap file.
+#
+# With vm-max-memory 0 the system will swap everything it can. Not a good
+# default, just specify the max amount of RAM you can in bytes, but it's
+# better to leave some margin. For instance specify an amount of RAM
+# that's more or less between 60 and 80% of your free RAM.
+vm-max-memory 0
+
+# Redis swap files is split into pages. An object can be saved using
multiple
+# contiguous pages, but pages can't be shared between different objects.
+# So if your page is too big, small objects swapped out on disk will waste
+# a lot of space. If your page is too small, there is less space in the swap
+# file (assuming you configured the same number of total swap file pages).
+#
+# If you use a lot of small objects, use a page size of 64 or 32 bytes.
+# If you use a lot of big objects, use a bigger page size.
+# If unsure, use the default :)
+vm-page-size 32
+
+# Number of total memory pages in the swap file.
+# Given that the page table (a bitmap of free/used pages) is taken in
memory,
+# every 8 pages on disk will consume 1 byte of RAM.
+#
+# The total swap size is vm-page-size * vm-pages
+#
+# With the default of 32-bytes memory pages and 134217728 pages Redis will
+# use a 4 GB swap file, that will use 16 MB of RAM for the page table.
+#
+# It's better to use the smallest acceptable value for your application,
+# but the default is large in order to work in most conditions.
+vm-pages 134217728
+
+# Max number of VM I/O threads running at the same time.
+# These threads are used to read/write data from/to swap file, since they
+# also encode and decode objects from disk to memory or the reverse, a
bigger
+# number of threads can help with big objects even if they can't help with
+# I/O itself as the physical device may not be able to cope with many
+# reads/writes operations at the same time.
+#
+# The special value of 0 turn off threaded I/O and enables the blocking
+# Virtual Memory implementation.
+vm-max-threads 4
+
+############################### ADVANCED CONFIG
###############################
+
+# Hashes are encoded in a special way (much more memory efficient) when
they
+# have at max a given number of elements, and the biggest element does not
+# exceed a given threshold. You can configure these limits with the
following
+# configuration directives.
+hash-max-zipmap-entries 512
+hash-max-zipmap-value 64
+
+# Similarly to hashes, small lists are also encoded in a special way in
order
+# to save a lot of space. The special representation is only used when
+# you are under the following limits:
+list-max-ziplist-entries 512
+list-max-ziplist-value 64
+
+# Sets have a special encoding in just one case: when a set is composed
+# of just strings that happens to be integers in radix 10 in the range
+# of 64 bit signed integers.
+# The following configuration setting sets the limit in the size of the
+# set in order to use this special memory saving encoding.
+set-max-intset-entries 512
+
+# Similarly to hashes and lists, sorted sets are also specially encoded in
+# order to save a lot of space. This encoding is only used when the length
and
+# elements of a sorted set are below the following limits:
+zset-max-ziplist-entries 128
+zset-max-ziplist-value 64
+
+# Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in
+# order to help rehashing the main Redis hash table (the one mapping
top-level
+# keys to values). The hash table implementation redis uses (see dict.c)
+# performs a lazy rehashing: the more operations you run against a hash table
+# that is rehashing, the more rehashing "steps" are performed, so if the
+# server is idle the rehashing is never complete and some more memory is
used
+# by the hash table.
+#
+# The default is to use this millisecond 10 times every second in order to
+# active rehashing the main dictionaries, freeing memory when possible.
+#
+# If unsure:
+# use "activerehashing no" if you have hard latency requirements and it is
+# not a good thing in your environment that Redis can reply from time to
time
+# to queries with 2 milliseconds delay.
+#
+# use "activerehashing yes" if you don't have such hard requirements but
+# want to free memory asap when possible.
+activerehashing yes
+
+################################## INCLUDES
###################################
+
+# Include one or more other config files here. This is useful if you
+# have a standard template that goes to all redis server but also need
+# to customize a few per-server settings. Include files can include
+# other files, so use this wisely.
+#
+# include /path/to/local.conf
+# include /path/to/other.conf
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/lib/jredis-core-ri-a.0-SNAPSHOT-jar-with-dependencies.jar
Fri Jun 22 02:46:58 2012
Binary file, no diff available.
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/lib/junit-4.8.2.jar Fri Jun 22 02:46:58 2012
Binary file, no diff available.
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/lib/lucene-analyzers-common-4.0-SNAPSHOT.jar
Fri Jun 22 02:46:58 2012
File is too large to display a diff.
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/lib/lucene-core-4.0-SNAPSHOT.jar Fri Jun 22
02:46:58 2012
File is too large to display a diff.
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/oldsrc/org/apache/lucene/index/UpdatingIndexWriter.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,123 @@
+package org.apache.lucene.index;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.store.CompoundFileDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import uk.co.flax.rediscodec.Diff;
+
+import java.io.IOException;
+import java.util.BitSet;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class UpdatingIndexWriter extends IndexWriter {
+    /**
+     * Constructs a new IndexWriter per the settings given in <code>conf</code>.
+     * Note that the passed in {@link org.apache.lucene.index.IndexWriterConfig} is
+     * privately cloned; if you need to make subsequent "live"
+     * changes to the configuration use {@link #getConfig}.
+     *
+     * @param d    the index directory. The index is either created or appended
+     *             according to <code>conf.getOpenMode()</code>.
+     * @param conf the configuration settings according to which IndexWriter should
+     *             be initialized.
+     * @throws org.apache.lucene.index.CorruptIndexException
+     *          if the index is corrupt
+     * @throws org.apache.lucene.store.LockObtainFailedException
+     *          if another writer has this index open (<code>write.lock</code>
+     *          could not be obtained)
+     * @throws java.io.IOException if the directory cannot be read/written to, or if it does not
+     *          exist and <code>conf.getOpenMode()</code> is
+     *          <code>OpenMode.APPEND</code> or if there is any other low-level
+     *          IO error
+     */
+    public UpdatingIndexWriter(Directory d, IndexWriterConfig conf) throws IOException {
+        super(d, conf);
+    }
+
+    /**
+     * Pushes the postings of every segment of this writer back through the
+     * segment codec's {@link FieldsConsumer}.
+     *
+     * NOTE(review): the documents matching <code>query</code> are collected per
+     * segment, but neither they nor <code>diff</code> are applied to the
+     * re-written postings yet.
+     *
+     * @param query selects the documents that should be updated
+     * @param diff  the term additions/deletions to apply (currently unused)
+     * @throws IOException on any low-level IO error
+     */
+    public void updateByQuery(Query query, Diff diff) throws IOException {
+        // The reader is only needed to find the directory; keep it open while the
+        // segments are processed and make sure it is released afterwards.
+        DirectoryReader nrtReader = getReader();
+        try {
+            Directory directory = nrtReader.directory();
+            for (SegmentInfoPerCommit si : this.segmentInfos) {
+                rewriteSegment(si, directory, query);
+            }
+        } finally {
+            nrtReader.close();
+        }
+    }
+
+    /** Re-writes the postings of one segment, closing the reader and consumer it opens. */
+    private void rewriteSegment(SegmentInfoPerCommit si, Directory directory, Query query) throws IOException {
+        SegmentReader reader = new SegmentReader(si, -1, new IOContext());
+        try {
+            // TODO: docsToUpdate is computed but not yet used by the re-write below.
+            BitSet docsToUpdate = getMatchingDocs(query, reader);
+
+            FieldInfos fis = getFieldInfos(si.info);
+
+            SegmentWriteState writeState =
+                    new SegmentWriteState(this.infoStream, directory, si.info, fis, -1, null, IOContext.DEFAULT);
+            FieldsConsumer consumer = si.info.getCodec().postingsFormat().fieldsConsumer(writeState);
+            try {
+                MergeState mergeState = new MergeState();
+                mergeState.fieldInfos = fis;
+                mergeState.segmentInfo = si.info;
+                consumer.merge(mergeState, reader.fields());
+            } finally {
+                consumer.close();
+            }
+        } finally {
+            reader.close();
+        }
+    }
+
+    /**
+     * Runs <code>query</code> against a single segment and returns the ids of
+     * the matching documents as a bit set sized to the segment's maxDoc.
+     */
+    private static BitSet getMatchingDocs(Query query, SegmentReader reader) throws IOException {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        final BitSet bits = new BitSet(reader.maxDoc());
+        searcher.search(query, new Collector() {
+            private int docBase;
+
+            public void setScorer(Scorer scorer) {
+                // scores are not needed, only the matching doc ids
+            }
+
+            public boolean acceptsDocsOutOfOrder() {
+                return true;
+            }
+
+            public void collect(int doc) {
+                bits.set(doc + docBase);
+            }
+
+            public void setNextReader(AtomicReaderContext context) {
+                this.docBase = context.docBase;
+            }
+        });
+        return bits;
+    }
+
+    // Shamelessly copied from parent...
+    /**
+     * Reads the FieldInfos of a segment, going through the compound file when
+     * the segment uses one. The compound-file directory is closed again before
+     * returning; the segment's own directory is left open.
+     */
+    private FieldInfos getFieldInfos(SegmentInfo info) throws IOException {
+        Directory cfsDir = null;
+        try {
+            if (info.getUseCompoundFile()) {
+                cfsDir = new CompoundFileDirectory(info.dir,
+                        IndexFileNames.segmentFileName(info.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION),
+                        IOContext.READONCE,
+                        false);
+            } else {
+                cfsDir = info.dir;
+            }
+            return info.getCodec().fieldInfosFormat().getFieldInfosReader().read(cfsDir,
+                    info.name,
+                    IOContext.READONCE);
+        } finally {
+            if (info.getUseCompoundFile() && cfsDir != null) {
+                cfsDir.close();
+            }
+        }
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/Example.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,141 @@
+package uk.co.flax.externalcodec;
+
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.lucene.util.Version;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.BitSet;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class Example {
+
+    /*
+     So here's what we do:
+
+     - Add a whole bunch of documents, occasionally forcing segment writes, so
+       that we have a number of segments.
+
+     - updateByQuery(Directory dir, Query query, Diff diff)
+         - open dir, read segments
+         - for each segment:
+             - run query and get list of docids
+             - create a new mergepolicy with the list of docids and diffs, and set it
+               on the directory
+             - forceMerge()
+
+     UpdateFieldMergePolicy(docids, diffs)
+         findMerges -> return a series of OneMerge objects combining the actual segment
+                       with a faked SegmentInfoPerCommit containing the Diff
+
+
+     FieldsConsumer.merge()
+         FieldsConsumer.addField() -> overridden in UpdatingCodec
+
+     */
+
+    public static final String indexDirectory = "index/";
+
+    /**
+     * Entry point: opens the index directory and runs the currently enabled
+     * experiment against it. The directory is closed when the run finishes.
+     */
+    public static void main(String[] args) throws IOException {
+        Directory dir = new SimpleFSDirectory(new File(indexDirectory));
+        try {
+            //writeExampleIndex(dir);
+            Query q = new TermQuery(new Term("tag", "exampletag"));
+            Diff diff = new Diff();
+            //querySegments(q, dir);
+            //runFakeMerge(dir, q, diff);
+            runUpdatingIndexWriter(dir, q, diff);
+        } finally {
+            dir.close();
+        }
+    }
+
+    /** Opens an UpdatingIndexWriter on <code>dir</code>, runs updateByQuery and closes the writer. */
+    static void runUpdatingIndexWriter(Directory dir, Query query, Diff diff) throws IOException {
+        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, new KeywordAnalyzer());
+        iwc.setCodec(Codec.forName("SimpleText"));
+        UpdatingIndexWriter iw = new UpdatingIndexWriter(dir, iwc);
+        try {
+            iw.updateByQuery(query, diff);
+        } finally {
+            // Always close so the write lock on the directory is released.
+            iw.close();
+        }
+    }
+
+    /** Opens a writer whose merge policy is UpdatingCodecMergePolicy and asks it to merge. */
+    static void runFakeMerge(Directory dir, Query query, Diff diff) throws IOException {
+        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, new KeywordAnalyzer());
+        iwc.setCodec(Codec.forName("SimpleText"));
+        iwc.setMergePolicy(new UpdatingCodecMergePolicy(query, diff));
+
+        IndexWriter writer = new IndexWriter(dir, iwc);
+        try {
+            writer.maybeMerge();
+        } finally {
+            writer.close();
+        }
+    }
+
+    /** Runs <code>query</code> against each sub-reader of the index and prints the matching doc ids. */
+    static void querySegments(Query query, Directory dir) throws IOException {
+        CompositeReader reader = DirectoryReader.open(dir);
+        try {
+            for (IndexReader ir : reader.getSequentialSubReaders()) {
+                IndexSearcher searcher = new IndexSearcher(ir);
+                final BitSet bits = new BitSet(ir.maxDoc());
+                searcher.search(query, new Collector() {
+                    private int docBase;
+
+                    public void setScorer(Scorer scorer) {
+                        // scores are not needed, only the matching doc ids
+                    }
+
+                    public boolean acceptsDocsOutOfOrder() {
+                        return true;
+                    }
+
+                    public void collect(int doc) {
+                        bits.set(doc + docBase);
+                    }
+
+                    public void setNextReader(AtomicReaderContext context) {
+                        this.docBase = context.docBase;
+                    }
+                });
+                dump("Matching docs", bits);
+            }
+        } finally {
+            reader.close();
+        }
+    }
+
+    /** Prints <code>message</code> followed by the index of every set bit, one per line. */
+    static void dump(String message, BitSet bits) {
+        System.out.println(message);
+        for (int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) {
+            System.out.println(i);
+        }
+    }
+
+    /** Writes twenty small documents (ids 20..39, all tagged "exampletag") with the SimpleText codec. */
+    static void writeExampleIndex(Directory dir) throws IOException {
+
+        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, new KeywordAnalyzer());
+        iwc.setCodec(Codec.forName("SimpleText"));
+
+        IndexWriter writer = new IndexWriter(dir, iwc);
+        try {
+            FieldType ft = new FieldType();
+            ft.setIndexed(true);
+            for (int i = 20; i < 40; i++) {
+                Document doc = new Document();
+                doc.add(new Field("id", Integer.toString(i), ft));
+                doc.add(new Field("tag", "exampletag", ft));
+                writer.addDocument(doc);
+            }
+        } finally {
+            // Close even when indexing fails so the write lock is not left behind.
+            writer.close();
+        }
+
+    }
+
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/ExternalCodec.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,35 @@
+package uk.co.flax.externalcodec;
+
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.lucene40.Lucene40Codec;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class ExternalCodec extends Lucene40Codec {
+
+    /** Name of the single field whose postings go to the external format. */
+    private static final String EXTERNAL_FIELD = "external";
+
+    private final PostingsFormat stdPostingsFormat = new Lucene40PostingsFormat();
+    private final PostingsFormat externalPostingsFormat = new ExternalPostingsFormat();
+
+    /**
+     * Chooses the postings format for a field: the external format for the
+     * field named "external", the standard Lucene40 format for everything else.
+     *
+     * @param field the field name
+     * @return the postings format to use for that field
+     */
+    @Override
+    public PostingsFormat getPostingsFormatForField(String field) {
+        // Compare by content: == only matches when both sides happen to be the
+        // same String instance, which is not guaranteed for field names.
+        if (EXTERNAL_FIELD.equals(field)) {
+            return externalPostingsFormat;
+        }
+        return stdPostingsFormat;
+    }
+
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/ExternalPostingsFormat.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,41 @@
+package uk.co.flax.externalcodec;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+import java.io.IOException;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class ExternalPostingsFormat extends PostingsFormat {
+
+    /** Name this format is registered under. */
+    private static final String FORMAT_NAME = "ExternalField";
+
+    protected ExternalPostingsFormat() {
+        super(FORMAT_NAME);
+    }
+
+    /**
+     * Placeholder: no consumer is implemented yet.
+     *
+     * @return always <code>null</code>
+     */
+    @Override
+    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+        final FieldsConsumer notImplemented = null;
+        return notImplemented;
+    }
+
+    /**
+     * Placeholder: no producer is implemented yet.
+     *
+     * @return always <code>null</code>
+     */
+    @Override
+    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+        final FieldsProducer notImplemented = null;
+        return notImplemented;
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/oldsrc/uk/co/flax/externalcodec/UpdatingCodecMergePolicy.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,105 @@
+package uk.co.flax.externalcodec;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.store.IOContext;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class UpdatingCodecMergePolicy extends MergePolicy {
+
+    private final Query query;
+    private final Diff diff;
+
+    /**
+     * @param query selects the documents to update
+     * @param diff  the changes to attach to each segment's fake merge
+     */
+    public UpdatingCodecMergePolicy(Query query, Diff diff) {
+        this.query = query;
+        this.diff = diff;
+    }
+
+    /**
+     * Builds one merge per segment, pairing the segment with a SegmentDiff
+     * that carries the diff and the ids of the documents matching the query.
+     */
+    @Override
+    public MergeSpecification findMerges(SegmentInfos segmentInfos) throws CorruptIndexException, IOException {
+        // So the idea here is to create fake merge specs for each segment that has
+        // a corresponding query match.
+        MergeSpecification spec = new MergeSpecification();
+
+        for (SegmentInfoPerCommit si : segmentInfos.asList()) {
+            BitSet docsToUpdate;
+            SegmentReader reader = new SegmentReader(si, -1, new IOContext());
+            try {
+                docsToUpdate = getMatchingDocs(reader);
+            } finally {
+                // The reader is only needed for the query; release it right away.
+                reader.close();
+            }
+            List<SegmentInfoPerCommit> commits = new ArrayList<SegmentInfoPerCommit>();
+            commits.add(si);
+            // NOTE(review): SegmentDiff is a placeholder that does not extend
+            // SegmentInfoPerCommit - confirm the intended wrapper type.
+            commits.add(new SegmentDiff(diff, docsToUpdate));
+            spec.add(new OneMerge(commits));
+        }
+
+        return spec;
+    }
+
+    /** Placeholder for the fake segment carrying a diff and the doc ids it applies to. */
+    private class SegmentDiff {
+        public SegmentDiff(Diff diff, BitSet bits) {}
+    }
+
+    /** Runs this policy's query against one segment and returns the matching doc ids. */
+    private BitSet getMatchingDocs(SegmentReader reader) throws IOException {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        final BitSet bits = new BitSet(reader.maxDoc());
+        searcher.search(query, new Collector() {
+            private int docBase;
+
+            public void setScorer(Scorer scorer) {
+                // scores are not needed, only the matching doc ids
+            }
+
+            public boolean acceptsDocsOutOfOrder() {
+                return true;
+            }
+
+            public void collect(int doc) {
+                bits.set(doc + docBase);
+            }
+
+            public void setNextReader(AtomicReaderContext context) {
+                this.docBase = context.docBase;
+            }
+        });
+        return bits;
+    }
+
+    /** Delegates to {@link #findMerges}; the extra arguments are ignored. */
+    @Override
+    public MergeSpecification findForcedMerges(SegmentInfos segmentInfos, int maxSegmentCount, Map<SegmentInfoPerCommit, Boolean> segmentsToMerge) throws CorruptIndexException, IOException {
+        return findMerges(segmentInfos);
+    }
+
+    /** Delegates to {@link #findMerges}. */
+    @Override
+    public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos) throws CorruptIndexException, IOException {
+        return findMerges(segmentInfos);
+    }
+
+    /** Nothing to release. */
+    @Override
+    public void close() {
+    }
+
+    /** New segments are never written as compound files. */
+    @Override
+    public boolean useCompoundFile(SegmentInfos segments, SegmentInfoPerCommit newSegment) throws IOException {
+        return false;
+    }
+}
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/Diff.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,43 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.index.Term;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class Diff {
+
+    /** Terms to be added, in the order they were registered. */
+    private final List<Term> additions = new ArrayList<Term>();
+    /** Terms to be removed, in the order they were registered. */
+    private final List<Term> deletions = new ArrayList<Term>();
+
+    /** Registers a term that the update should add. */
+    public void addTerm(Term term) {
+        additions.add(term);
+    }
+
+    /** Registers a term that the update should delete. */
+    public void deleteTerm(Term term) {
+        deletions.add(term);
+    }
+
+    /** @return the live list of terms to add (not a copy) */
+    public List<Term> getAdds() {
+        return additions;
+    }
+
+    /** @return the live list of terms to delete (not a copy) */
+    public List<Term> getDeletes() {
+        return deletions;
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingFieldsConsumer.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,45 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.jredis.ri.alphazero.JRedisClient;
+
+import java.io.IOException;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class RedisUpdatingFieldsConsumer extends FieldsConsumer {
+
+    private final String segmentName;
+    private final JRedisClient redis;
+
+    /**
+     * Creates a consumer for the segment described by <code>state</code>,
+     * opening its own Redis connection with the client's default settings.
+     */
+    public RedisUpdatingFieldsConsumer(SegmentWriteState state) {
+        segmentName = state.segmentInfo.name;
+        redis = new JRedisClient(); // TODO: host, port, etc!
+    }
+
+    /**
+     * Returns a terms writer that stores postings in Redis under keys prefixed
+     * with the segment name. The field itself is not part of the key.
+     */
+    @Override
+    public TermsConsumer addField(FieldInfo field) throws IOException {
+        return new RedisUpdatingTermsWriter(redis, segmentName);
+    }
+
+    /** Releases the Redis connection opened by the constructor. */
+    @Override
+    public void close() throws IOException {
+        // The connection was opened by this object, so it must be released here;
+        // otherwise every written segment leaves a connection behind.
+        redis.quit();
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingFieldsProducer.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,248 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.jredis.RedisException;
+import org.jredis.ri.alphazero.JRedisClient;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class RedisUpdatingFieldsProducer extends FieldsProducer {
+
+    /** The only field this producer serves. */
+    private static final String FIELD_NAME = "tag";
+
+    private final JRedisClient redis;
+    private final String segment;
+
+    /**
+     * Creates a producer for the segment described by <code>state</code>,
+     * opening its own Redis connection with the client's default settings.
+     */
+    public RedisUpdatingFieldsProducer(SegmentReadState state) {
+        this.segment = state.segmentInfo.name;
+        this.redis = new JRedisClient(); // TODO: host, port, etc
+    }
+
+    /** Releases the Redis connection opened by the constructor. */
+    @Override
+    public void close() throws IOException {
+        redis.quit();
+    }
+
+    /** Enumerates the single field "tag". */
+    @Override
+    public FieldsEnum iterator() throws IOException {
+        return new FieldsEnum() {
+
+            boolean done = false;
+
+            @Override
+            public String next() throws IOException {
+                if (!done) {
+                    done = true;
+                    return FIELD_NAME;
+                }
+                return null;
+            }
+
+            @Override
+            public Terms terms() throws IOException {
+                return RedisUpdatingFieldsProducer.this.terms(FIELD_NAME);
+            }
+        };
+    }
+
+    /**
+     * @return the Redis-backed terms for the field "tag", or <code>null</code>
+     *         for any other field
+     */
+    @Override
+    public Terms terms(String field) throws IOException {
+        // Compare by content; identity comparison only works for interned strings.
+        if (!FIELD_NAME.equals(field)) {
+            return null;
+        }
+        return new RedisUpdatingTerms();
+    }
+
+    /** @return -1, the number of fields is not reported */
+    @Override
+    public int size() throws IOException {
+        return -1;
+    }
+
+    /** Terms view over Redis; all statistics are reported as -1. */
+    public class RedisUpdatingTerms extends Terms {
+
+        @Override
+        public TermsEnum iterator(TermsEnum reuse) throws IOException {
+            return new RedisUpdatingTermsEnum();
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() throws IOException {
+            return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
+
+        @Override
+        public long size() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public long getSumTotalTermFreq() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public long getSumDocFreq() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public int getDocCount() throws IOException {
+            return -1;
+        }
+
+    }
+
+    /**
+     * Terms enum that only supports exact lookups: each seek fetches the value
+     * stored under "segment_term" and decodes it as a sequence of 4-byte ints.
+     */
+    private class RedisUpdatingTermsEnum extends TermsEnum {
+
+        private int[] docs;
+        private BytesRef term;
+
+        @Override
+        public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
+            try {
+                String key = segment + "_" + text.utf8ToString();
+                if (!redis.exists(key)) {
+                    return SeekStatus.END; // todo iteration
+                }
+                byte[] data = redis.get(key);
+                // Each doc id occupies four bytes in the stored value.
+                docs = new int[data.length / 4];
+                ByteBuffer.wrap(data).asIntBuffer().get(docs);
+                term = text.clone();
+                return SeekStatus.FOUND;
+
+            } catch (RedisException e) {
+                throw new IOException(e);
+            }
+        }
+
+        @Override
+        public void seekExact(long ord) throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public BytesRef term() throws IOException {
+            return term;
+        }
+
+        @Override
+        public long ord() throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public int docFreq() throws IOException {
+            return docs.length;
+        }
+
+        @Override
+        public long totalTermFreq() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) throws IOException {
+            // Frequencies are not stored, so a caller that needs them gets null.
+            if (needsFreqs) {
+                return null;
+            }
+            return new RedisUpdatingDocsAndPositionsEnum(liveDocs, docs);
+        }
+
+        @Override
+        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
+            // Offsets are not stored, so a caller that needs them gets null.
+            if (needsOffsets) {
+                return null;
+            }
+            return new RedisUpdatingDocsAndPositionsEnum(liveDocs, docs);
+        }
+
+        @Override
+        public BytesRef next() throws IOException {
+            return null;
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() {
+            return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
+    }
+
+    /**
+     * Iterates an in-memory array of doc ids, skipping documents that
+     * <code>liveDocs</code> marks as deleted. Positions, offsets, payloads and
+     * frequencies are not available. advance() binary-searches the array, so
+     * it relies on the ids being in ascending order.
+     */
+    private class RedisUpdatingDocsAndPositionsEnum extends DocsAndPositionsEnum {
+
+        final Bits liveDocs;
+        int[] docs;
+        int current;
+
+        public RedisUpdatingDocsAndPositionsEnum(Bits liveDocs, int[] docs) {
+            this.liveDocs = liveDocs;
+            this.docs = docs;
+            this.current = -1;
+        }
+
+        @Override
+        public int nextPosition() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public int startOffset() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public int endOffset() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public BytesRef getPayload() throws IOException {
+            return null;
+        }
+
+        @Override
+        public boolean hasPayload() {
+            return false;
+        }
+
+        @Override
+        public int freq() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public int docID() {
+            if (current == -1 || current >= docs.length) {
+                return NO_MORE_DOCS;
+            }
+            return docs[current];
+        }
+
+        @Override
+        public int nextDoc() throws IOException {
+            current++;
+            return skipDeleted();
+        }
+
+        @Override
+        public int advance(int target) throws IOException {
+            current = Arrays.binarySearch(docs, target);
+            if (current < 0) {
+                // Not found: binarySearch encodes the insertion point as -(point + 1).
+                current = -(current + 1);
+            }
+            return skipDeleted();
+        }
+
+        /** Moves forward past deleted documents and returns the resulting doc id. */
+        private int skipDeleted() {
+            // A null liveDocs means the segment has no deletions.
+            if (liveDocs != null) {
+                while (current < docs.length && !liveDocs.get(docs[current])) {
+                    current++;
+                }
+            }
+            return docID();
+        }
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingPostingsFormat.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,41 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+import java.io.IOException;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class RedisUpdatingPostingsFormat extends PostingsFormat {
+
+    /** Name this format is registered under. */
+    private static final String FORMAT_NAME = "ExternalRedisCodec";
+
+    public RedisUpdatingPostingsFormat() {
+        super(FORMAT_NAME);
+    }
+
+    /** Creates the Redis-backed writer side for the segment in <code>state</code>. */
+    @Override
+    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+        final FieldsConsumer consumer = new RedisUpdatingFieldsConsumer(state);
+        return consumer;
+    }
+
+    /** Creates the Redis-backed reader side for the segment in <code>state</code>. */
+    @Override
+    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+        final FieldsProducer producer = new RedisUpdatingFieldsProducer(state);
+        return producer;
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingPostingsWriter.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,73 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class RedisUpdatingPostingsWriter extends PostingsConsumer {
+
+    private final String segment;
+    private String term;
+    /** Buffer of doc ids for the current term; only the first doccount entries are valid. */
+    private final IntsRef docs = new IntsRef(1024);
+    private int doccount = 0;
+
+    /**
+     * @param segmentname name of the segment being written; used as the key prefix
+     */
+    public RedisUpdatingPostingsWriter(String segmentname) {
+        this.segment = segmentname;
+    }
+
+    /** Appends <code>docID</code> to the buffer for the current term; <code>freq</code> is ignored. */
+    @Override
+    public void startDoc(int docID, int freq) throws IOException {
+        // TODO: store frequencies as well?
+        // Grow BEFORE writing, and size against the backing array: docs.length is
+        // reset to 0 for every term and says nothing about the real capacity.
+        if (doccount == docs.ints.length) {
+            docs.grow(doccount + 1);
+        }
+        docs.ints[doccount++] = docID;
+        docs.length = doccount;
+    }
+
+    /** Positions, payloads and offsets are not stored. */
+    @Override
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+    }
+
+    /** Nothing to do per document. */
+    @Override
+    public void finishDoc() throws IOException {
+    }
+
+    /**
+     * Resets the buffer and starts collecting postings for <code>text</code>.
+     *
+     * @return this writer, so it can be handed out as the term's PostingsConsumer
+     */
+    public PostingsConsumer setTerm(BytesRef text) {
+        this.term = text.utf8ToString();
+        this.docs.length = 0;
+        this.doccount = 0;
+        return this;
+    }
+
+    /** @return the key for the current term: segment name, underscore, term text */
+    public String getKey() {
+        return segment + "_" + term;
+    }
+
+    /** @return the buffered doc ids of the current term, four bytes per id in ByteBuffer's default byte order */
+    public byte[] getPostings() {
+        ByteBuffer bytes = ByteBuffer.allocate(doccount * 4);
+        IntBuffer ib = bytes.asIntBuffer();
+        ib.put(docs.ints, 0, doccount);
+        return bytes.array();
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/RedisUpdatingTermsWriter.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,63 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.util.BytesRef;
+import org.jredis.RedisException;
+import org.jredis.ri.alphazero.JRedisClient;
+
+import java.io.IOException;
+import java.util.Comparator;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class RedisUpdatingTermsWriter extends TermsConsumer {
+
+    /** Collects the doc ids of the term currently being written. */
+    private RedisUpdatingPostingsWriter postings;
+    /** Connection used to store each finished term. */
+    private JRedisClient client;
+
+    /**
+     * @param redis       the connection to write terms to
+     * @param segmentname name of the segment being written
+     */
+    public RedisUpdatingTermsWriter(JRedisClient redis, String segmentname) {
+        this.postings = new RedisUpdatingPostingsWriter(segmentname);
+        this.client = redis;
+    }
+
+    /** Starts a new term and returns the consumer that will receive its documents. */
+    @Override
+    public PostingsConsumer startTerm(BytesRef text) throws IOException {
+        final PostingsConsumer consumer = postings.setTerm(text);
+        return consumer;
+    }
+
+    /**
+     * Stores the postings collected for the current term in Redis.
+     *
+     * @throws IOException wrapping any RedisException raised by the write
+     */
+    @Override
+    public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+        final String key = postings.getKey();
+        final byte[] value = postings.getPostings();
+        try {
+            client.set(key, value);
+        } catch (RedisException e) {
+            throw new IOException(e);
+        }
+    }
+
+    /** Nothing to do once all terms of the field are written. */
+    @Override
+    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+    }
+
+    /** @return the UTF-8-sorted-as-Unicode term comparator */
+    @Override
+    public Comparator<BytesRef> getComparator() throws IOException {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+}
=======================================
--- /dev/null
+++ /trunk/LuceneRedisCodec/poc/src/java/uk/co/flax/rediscodec/Updater.java
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,154 @@
+package uk.co.flax.rediscodec;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.jredis.RedisException;
+import org.jredis.ri.alphazero.JRedisClient;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+import java.util.Arrays;
+import java.util.BitSet;
+
+/**
+ * Copyright (c) 2012 Lemur Consulting Ltd.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Applies term-level updates to postings lists held in Redis, one
+ * Lucene segment at a time.
+ *
+ * A postings list is read and written as a packed sequence of 4-byte
+ * ints (ByteBuffer's default byte order) stored under the key
+ * {@code segment + "_" + term.text()}. The add/remove helpers assume
+ * the stored doc ids are sorted ascending, as the values written by
+ * this class are.
+ */
+public class Updater {
+
+    /**
+     * Runs a query against every segment of the index and applies the
+     * diff to the matching documents of each segment.
+     *
+     * @param dir         directory holding the index
+     * @param updatequery selects the documents to update
+     * @param diff        terms to add to / delete from those documents
+     * @throws IOException on index access failure, or wrapping a
+     *                     RedisException raised while updating
+     */
+    public static void updateByQuery(Directory dir, Query updatequery,
+            Diff diff) throws IOException {
+
+        // Doc ids are segment-relative, so the query is run against
+        // each segment separately.
+        SegmentInfos segments = new SegmentInfos();
+        segments.read(dir);
+
+        for (SegmentInfoPerCommit si : segments) {
+            SegmentReader reader =
+                    new SegmentReader(si, 1, new IOContext());
+            try {
+                BitSet docsToUpdate =
+                        getMatchingDocs(updatequery, reader);
+                updateSegment(si.info.name, docsToUpdate, diff);
+            } finally {
+                // Previously leaked: one open reader per segment.
+                reader.close();
+            }
+        }
+    }
+
+    /**
+     * Rewrites the Redis postings of one segment: every added term
+     * gains the given docs, every deleted term loses them.
+     *
+     * @param segment segment name, used as the key prefix
+     * @param docs    segment-relative ids of the documents to update
+     * @param diff    terms to add and to delete
+     * @throws IOException wrapping any RedisException
+     */
+    private static void updateSegment(String segment, BitSet docs,
+            Diff diff) throws IOException {
+        JRedisClient redis = new JRedisClient();
+        try {
+            for (Term add : diff.getAdds()) {
+                String key = segment + "_" + add.text();
+                // A missing key is treated as an empty postings list.
+                int[] current = redis.exists(key)
+                        ? decodePostings(redis.get(key))
+                        : new int[0];
+                redis.set(key, encodePostings(addDocs(current, docs)));
+            }
+            for (Term del : diff.getDeletes()) {
+                String key = segment + "_" + del.text();
+                if (!redis.exists(key)) {
+                    continue;
+                }
+                int[] current = decodePostings(redis.get(key));
+                redis.set(key,
+                        encodePostings(removeDocs(current, docs)));
+            }
+        } catch (RedisException e) {
+            throw new IOException(e);
+        } finally {
+            // Previously leaked: one open connection per segment.
+            redis.quit();
+        }
+    }
+
+    /**
+     * Returns the sorted union of a postings list and a doc set.
+     * Ids already present are emitted once; original entries after the
+     * last added doc are preserved, and the result holds no padding.
+     */
+    private static int[] addDocs(int[] postings, BitSet docs) {
+        int[] merged = new int[postings.length + docs.cardinality()];
+        int src = 0;
+        int dst = 0;
+        for (int doc = docs.nextSetBit(0); doc >= 0;
+                doc = docs.nextSetBit(doc + 1)) {
+            // Carry over existing postings that sort before this doc.
+            while (src < postings.length && postings[src] < doc) {
+                merged[dst++] = postings[src++];
+            }
+            // Already indexed for this term: skip the stored copy so
+            // the id is written exactly once.
+            if (src < postings.length && postings[src] == doc) {
+                src++;
+            }
+            merged[dst++] = doc;
+        }
+        // Everything after the last added doc is kept unchanged.
+        int tail = postings.length - src;
+        System.arraycopy(postings, src, merged, dst, tail);
+        dst += tail;
+        return dst == merged.length
+                ? merged : Arrays.copyOf(merged, dst);
+    }
+
+    /**
+     * Returns the postings list without the ids set in the doc set,
+     * order preserved. Docs absent from the list are simply ignored.
+     */
+    private static int[] removeDocs(int[] postings, BitSet docs) {
+        int[] kept = new int[postings.length];
+        int dst = 0;
+        for (int doc : postings) {
+            // BitSet.get rejects negative indexes; such ids can never
+            // be in the set, so they are kept.
+            if (doc < 0 || !docs.get(doc)) {
+                kept[dst++] = doc;
+            }
+        }
+        return Arrays.copyOf(kept, dst);
+    }
+
+    /**
+     * Unpacks a stored value into doc ids, four bytes per id. A null
+     * value yields an empty list; trailing bytes that do not fill a
+     * whole int are ignored.
+     */
+    private static int[] decodePostings(byte[] raw) {
+        if (raw == null) {
+            return new int[0];
+        }
+        int[] postings = new int[raw.length / 4];
+        ByteBuffer.wrap(raw).asIntBuffer().get(postings);
+        return postings;
+    }
+
+    /** Packs doc ids into the stored form, four bytes per id. */
+    private static byte[] encodePostings(int[] postings) {
+        ByteBuffer buffer = ByteBuffer.allocate(postings.length * 4);
+        buffer.asIntBuffer().put(postings);
+        return buffer.array();
+    }
+
+    /**
+     * Collects the ids of all documents in the reader that match the
+     * query. Scores and ordering are ignored.
+     */
+    private static BitSet getMatchingDocs(Query query,
+            SegmentReader reader) throws IOException {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        final BitSet bits = new BitSet(reader.maxDoc());
+        searcher.search(query, new Collector() {
+            private int docBase;
+
+            @Override
+            public void setScorer(Scorer scorer) {
+                // Only doc ids are needed, not scores.
+            }
+
+            @Override
+            public boolean acceptsDocsOutOfOrder() {
+                return true;
+            }
+
+            @Override
+            public void collect(int doc) {
+                bits.set(doc + docBase);
+            }
+
+            @Override
+            public void setNextReader(AtomicReaderContext context) {
+                this.docBase = context.docBase;
+            }
+        });
+        return bits;
+    }
+}
=======================================
--- /dev/null
+++
/trunk/LuceneRedisCodec/poc/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
Fri Jun 22 02:46:58 2012
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+uk.co.flax.rediscodec.RedisUpdatingPostingsFormat
=======================================
***Additional files exist in this changeset.***

Reply all

Reply to author

Forward

0 new messages