### Eclipse Workspace Patch 1.0
#P dspace
Index: src/org/dspace/search/DSIndexer.java
===================================================================
RCS file: /cvsroot/dspace/dspace/src/org/dspace/search/DSIndexer.java,v
retrieving revision 1.41
diff -u -r1.41 DSIndexer.java
--- src/org/dspace/search/DSIndexer.java	3 Nov 2006 05:01:31 -0000	1.41
+++ src/org/dspace/search/DSIndexer.java	13 Dec 2006 18:12:29 -0000
@@ -39,13 +39,17 @@
  */
 package org.dspace.search;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.sql.SQLException;
 import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.PosixParser;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
@@ -62,7 +66,6 @@
 import org.dspace.content.DSpaceObject;
 import org.dspace.content.Item;
 import org.dspace.content.ItemIterator;
-import org.dspace.content.MetadataSchema;
 import org.dspace.core.ConfigurationManager;
 import org.dspace.core.Constants;
 import org.dspace.core.Context;
@@ -82,7 +85,7 @@
     // TODO: Support for analyzers per language, or multiple indices
 
     /** The analyzer for this DSpace instance */
     private static Analyzer analyzer = null;
-    
+
     /**
      * IndexItem() adds a single item to the index
      */
@@ -122,7 +125,7 @@
     /**
      * unIndex removes an Item, Collection, or Community only works if the
      * DSpaceObject has a handle (uses the handle for its unique ID)
-     * 
+     *
      * @param dso
      *            DSpace Object, can be Community, Item, or Collection
      */
@@ -165,7 +168,7 @@
 
     /**
      * reIndexContent removes something from the index, then re-indexes it
-     * 
+     *
      * @param c context object
      * @param dso object to re-index
      */
@@ -178,7 +181,7 @@
 
     /**
      * create full index - wiping old index
-     * 
+     *
      * @param c context to use
      */
     public static void createIndex(Context c) throws SQLException, IOException
@@ -190,7 +193,83 @@
             indexAllCommunities(c, writer);
             indexAllCollections(c, writer);
             indexAllItems(c, writer);
+        }
+        finally
+        {
+            closeIndex(c, writer);
+        }
+    }
+
+    public static void updateIndex(Context c, boolean test) throws SQLException, IOException
+    {
+
+        File index_dir = new File(System.getProperty("java.io.tmpdir"), System.currentTimeMillis() + "index");
+
+        IndexWriter writer = openIndex(c, index_dir, true);
+
+        try
+        {
+            indexAllCommunities(c, writer);
+            indexAllCollections(c, writer);
+            indexAllItems(c, writer);
+
+        }
+        finally
+        {
+            closeIndex(c, writer);
+        }
+
+        if(!test)
+        {
+            IndexReader reader = IndexReader.open(index_dir);
+
+            try
+            {
+                writer = openIndex(c, true);
+                writer.addIndexes( new IndexReader[]{ reader } );
+            }
+            finally
+            {
+                closeIndex(c, writer);
+                reader.close();
+                recursiveDelete(index_dir);
+            }
+        }
+
+    }
+
+    /**
+     * Recursively deletes contents of a directory
+     *
+     * (Note: Ant libraries do this, would be nice to have Ant as a tool in DSpace runtime)
+     * @param directory Directory to Delete.
+     */
+    private static void recursiveDelete(File directory){
+        File[] children = directory.listFiles();
+        for(int i = 0 ; i < children.length ; i++){
+            if(children[i].isFile())
+                children[i].delete();
+            else if (children[i].isDirectory())
+                recursiveDelete(children[i]);
+        }
+        directory.delete();
+    }
+
+    /**
+     * Optimize the existing index.
+     *
+     * @param c the user's Context
+     * @throws SQLException
+     * @throws IOException
+     */
+    public static void optimizeIndex(Context c) throws SQLException, IOException
+    {
+        IndexWriter writer = openIndex(c, false);
+
+        try
+        {
             // optimize the index - important to do regularly to reduce
             // filehandle
             // usage
@@ -205,34 +284,76 @@
 
     /**
      * When invoked as a command-line tool, (re)-builds the whole index
-     * 
+     *
      * @param args
      *            the command-line arguments, none used
      */
     public static void main(String[] args) throws Exception
     {
+        // create an options object and populate it
+        CommandLineParser parser = new PosixParser();
+
+        Options options = new Options();
+        options
+                .addOption("r", "remove", true,
+                        "remove an item from the index based on its handle");
+        options.addOption("o", "optimize", false, "optimize existing index");
+        options.addOption("c", "create", false,
+                "build new index in search directory, destroying existing index first");
+        options.addOption("u", "update", false,
+                "build new index in tmp directory and replace search directory when complete");
+
+        options
+                .addOption(
+                        "t",
+                        "test",
+                        false,
+                        "If used in conjunction with 'u', will only build new index, and will not replace old with new at end");
+
+        options.addOption("h", "help", false, "Print this help message");
+
+        CommandLine line = parser.parse(options, args);
+
+        if(!(line.hasOption("r") || line.hasOption("o") || line.hasOption("c") || line.hasOption("u")))
+        {
+            // automatically generate the help statement
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp( "DSIndexer", options );
+            System.exit(1);
+        }
+
         Context c = new Context();
 
+        c.setIgnoreAuthorization(true);
+
         // for testing, pass in a handle of something to remove...
-        if ((args.length == 2) && (args[0].equals("remove")))
-        {
-            unIndexContent(c, args[1]);
+        if (line.hasOption("r")){
+            log.info("Removing " + line.getOptionValue("r") + " from Index");
+            unIndexContent(c, line.getOptionValue("r"));
+        }
+        else if(line.hasOption("u")){
+            log.info("Updating Existing Index");
+            updateIndex(c, line.hasOption("t"));
         }
-        else
+        else if(line.hasOption("c"))
         {
-            c.setIgnoreAuthorization(true);
-
+            log.info("Creating New Index");
             createIndex(c);
+        }
 
-            System.out.println("Done with indexing");
+        if (line.hasOption("o"))
+        {
+            log.info("Optimizing Index");
+            optimizeIndex(c);
         }
+
+        log.info("Done with indexing");
     }
 
     /**
      * Get the Lucene analyzer to use according to current configuration (or
      * default). TODO: Should have multiple analyzers (and maybe indices?) for
      * multi-lingual DSpaces.
-     * 
+     *
      * @return Analyzer to use
      * @throws IllegalStateException
      *             if the configured analyzer can't be instantiated
@@ -267,23 +388,35 @@
 
         return analyzer;
     }
-    
-    
+
+
     ////////////////////////////////////
     // Private
     ////////////////////////////////////
 
     /**
-     * prepare index, opening writer, and wiping out existing index if necessary
+     * prepare the main index in the configured location, opening writer, and wiping out existing index if necessary
      */
    private static IndexWriter openIndex(Context c, boolean wipe_existing)
            throws IOException
    {
-        IndexWriter writer;
-
        String index_directory = ConfigurationManager.getProperty("search.dir");
 
+        return openIndex(c, new File(index_directory), wipe_existing);
+    }
+
+    /**
+     * prepare index in specified location, opening writer, and wiping out existing index if necessary
+     */
+    private static IndexWriter openIndex(Context c, File index_dir, boolean wipe_existing)
+            throws IOException
+    {
 
-        writer = new IndexWriter(index_directory, getAnalyzer(),
+        if(!index_dir.exists())
+        {
+            index_dir.mkdirs();
+        }
+
+        IndexWriter writer = new IndexWriter(index_dir, getAnalyzer(),
                 wipe_existing);
 
         /* Set maximum number of terms to index if present in dspace.cfg */
@@ -303,7 +436,6 @@
 
         return writer;
     }
-
     /**
      * close up the indexing engine
      */
@@ -395,6 +527,8 @@
             Item target = (Item) i.next();
 
             writeItemIndex(c, writer, target);
+
+            target.decache();
         }
     }
 
@@ -404,24 +538,22 @@
     private static void writeCommunityIndex(Context c, IndexWriter writer,
             Community target) throws SQLException, IOException
     {
-        // build a hash for the metadata
-        HashMap textvalues = new HashMap();
+        // get the handle
+        String handle = HandleManager.findHandle(c, target);
 
-        // get the handle
-        String myhandle = HandleManager.findHandle(c, target);
+        // Create Lucene Document
+        Document doc = createDocument(Constants.COMMUNITY, handle, null);
 
         // and populate it
         String name = target.getMetadata("name");
+        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
+        doc.add(new Field("default", name, Field.Store.YES, Field.Index.TOKENIZED));
 
-        // String description = target.getMetadata("short_description");
-        // String intro_text = target.getMetadata("introductory_text");
-        textvalues.put("name", name);
-
-        // textvalues.put("description", description);
-        // textvalues.put("intro_text", intro_text );
-        textvalues.put("handletext", myhandle);
+        log.info("Writing Community: " + handle + " to Index");
+
+        // Write to Lucene Index
+        writer.addDocument(doc);
 
-        writeIndexRecord(writer, Constants.COMMUNITY, myhandle, textvalues, "");
     }
 
     /**
@@ -432,25 +564,23 @@
     {
         String location_text = buildCollectionLocationString(c, target);
 
-        // get the handle 
+        // get the handle
         String myhandle = HandleManager.findHandle(c, target);
 
-        // build a hash for the metadata
-        HashMap textvalues = new HashMap();
+
+        // Create Lucene Document
+        Document doc = createDocument(Constants.COLLECTION, myhandle, location_text);
 
         // and populate it
         String name = target.getMetadata("name");
+        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
+        doc.add(new Field("default", name, Field.Store.YES, Field.Index.TOKENIZED));
 
-        // String description = target.getMetadata("short_description");
-        // String intro_text = target.getMetadata("introductory_text");
-        textvalues.put("name", name);
-
-        // textvalues.put("description",description );
-        // textvalues.put("intro_text", intro_text );
-        textvalues.put("location", location_text);
-        textvalues.put("handletext", myhandle);
+        log.info("Writing Collection: " + myhandle + " to Index");
 
-        writeIndexRecord(writer, Constants.COLLECTION, myhandle, textvalues, "");
+        // Write to Lucene Index
+        writer.addDocument(doc);
+
     }
 
     /**
@@ -460,10 +590,17 @@
     private static void writeItemIndex(Context c, IndexWriter writer,
             Item myitem) throws SQLException, IOException
     {
-        // FIXME: config reading should happen just once & be cached?
-
-        // get the location string (for searching by collection & community)
-        String location_text = buildItemLocationString(c, myitem);
+        // FIXME: config reading should happen just once & be cached?
+
+        // get the location string (for searching by collection & community)
+        String location = buildItemLocationString(c, myitem);
+
+        // firstly, get the handle
+        String handle = HandleManager.findHandle(c, myitem);
+
+        Document doc = createDocument(Constants.ITEM, handle, location);
+
+        log.info("Building Item: " + handle);
 
         // read in indexes from the config
         ArrayList indexes = new ArrayList();
@@ -477,9 +614,6 @@
         int j;
         int k = 0;
 
-        // initialize hash to be built
-        HashMap textvalues = new HashMap();
-
         if (indexes.size() > 0)
         {
             ArrayList fields = new ArrayList();
@@ -500,7 +634,7 @@
                 // Get the schema, element and qualifier for the index
                 // TODO: Should check valid schema, element, qualifier?
                 String[] parts = configLine[1].split("\\.");
-                
+
                 switch (parts.length)
                 {
                 case 3:
@@ -515,7 +649,7 @@
                     throw new RuntimeException(
                             "Malformed configuration line: search.index." + i);
                 }
-                
+
                 // extract metadata (ANY is wildcard from Item class)
                 if (qualifier!= null && qualifier.equals("*"))
                 {
@@ -558,123 +692,111 @@
             // build the hash
             for (int i = 0; i < fields.size(); i++)
             {
-                textvalues.put((String) fields.get(i), (String) content.get(i));
+
+                doc.add(
+                    new Field(
+                        (String) fields.get(i),
+                        (String) content.get(i),
+                        Field.Store.YES, Field.Index.TOKENIZED
+                    ));
+
+                doc.add(new Field("default", (String) content.get(i), Field.Store.YES, Field.Index.TOKENIZED));
+
             }
 
-            textvalues.put("location", location_text);
         }
         else
         // if no search indexes found in cfg file, for backward compatibility
         {
+            // extract metadata (ANY is wildcard from Item class)
             DCValue[] authors = myitem.getDC("contributor", Item.ANY, Item.ANY);
-            DCValue[] creators = myitem.getDC("creator", Item.ANY, Item.ANY);
-            DCValue[] titles = myitem.getDC("title", Item.ANY, Item.ANY);
-            DCValue[] keywords = myitem.getDC("subject", Item.ANY, Item.ANY);
-
-            DCValue[] abstracts = myitem.getDC("description", "abstract",
-                    Item.ANY);
-            DCValue[] sors = myitem.getDC("description",
-                    "statementofresponsibility", Item.ANY);
-            DCValue[] series = myitem.getDC("relation", "ispartofseries",
-                    Item.ANY);
-            DCValue[] tocs = myitem.getDC("description", "tableofcontents",
-                    Item.ANY);
-            DCValue[] mimetypes = myitem.getDC("format", "mimetype", Item.ANY);
-            DCValue[] sponsors = myitem.getDC("description", "sponsorship",
-                    Item.ANY);
-            DCValue[] identifiers = myitem.getDC("identifier", Item.ANY,
-                    Item.ANY);
-
-            // put them all from an array of strings to one string for writing
-            // out
-            String author_text = "";
-            String title_text = "";
-            String keyword_text = "";
-
-            String abstract_text = "";
-            String series_text = "";
-            String mime_text = "";
-            String sponsor_text = "";
-            String id_text = "";
-
-            // pack all of the arrays of DCValues into plain text strings for
-            // the
-            // indexer
             for (j = 0; j < authors.length; j++)
             {
-                author_text = new String(author_text + authors[j].value + " ");
+                doc.add(new Field("author", authors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", authors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] creators = myitem.getDC("creator", Item.ANY, Item.ANY);
             for (j = 0; j < creators.length; j++) //also authors
             {
-                author_text = new String(author_text + creators[j].value + " ");
+                doc.add(new Field("author", creators[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", creators[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] sors = myitem.getDC("description",
+                    "statementofresponsibility", Item.ANY);
             for (j = 0; j < sors.length; j++) //also authors
             {
-                author_text = new String(author_text + sors[j].value + " ");
+                doc.add(new Field("author", sors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", sors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] titles = myitem.getDC("title", Item.ANY, Item.ANY);
             for (j = 0; j < titles.length; j++)
             {
-                title_text = new String(title_text + titles[j].value + " ");
+                doc.add(new Field("title", titles[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", titles[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] keywords = myitem.getDC("subject", Item.ANY, Item.ANY);
             for (j = 0; j < keywords.length; j++)
             {
-                keyword_text = new String(keyword_text + keywords[j].value
-                        + " ");
+                doc.add(new Field("keyword", keywords[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", keywords[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] abstracts = myitem.getDC("description", "abstract", Item.ANY);
             for (j = 0; j < abstracts.length; j++)
             {
-                abstract_text = new String(abstract_text + abstracts[j].value
-                        + " ");
+                doc.add(new Field("abstract", abstracts[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", abstracts[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] tocs = myitem.getDC("description", "tableofcontents",
+                    Item.ANY);
             for (j = 0; j < tocs.length; j++)
             {
-                abstract_text = new String(abstract_text + tocs[j].value + " ");
+                doc.add(new Field("abstract", tocs[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", tocs[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] series = myitem.getDC("relation", "ispartofseries",
+                    Item.ANY);
             for (j = 0; j < series.length; j++)
             {
-                series_text = new String(series_text + series[j].value + " ");
+                doc.add(new Field("series", series[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", series[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] mimetypes = myitem.getDC("format", "mimetype", Item.ANY);
             for (j = 0; j < mimetypes.length; j++)
             {
-                mime_text = new String(mime_text + mimetypes[j].value + " ");
+                doc.add(new Field("mimetype", mimetypes[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", mimetypes[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] sponsors = myitem.getDC("description", "sponsorship",
+                    Item.ANY);
             for (j = 0; j < sponsors.length; j++)
            {
-                sponsor_text = new String(sponsor_text + sponsors[j].value
-                        + " ");
+                doc.add(new Field("sponsor", sponsors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", sponsors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] identifiers = myitem.getDC("identifier", Item.ANY,
+                    Item.ANY);
             for (j = 0; j < identifiers.length; j++)
             {
-                id_text = new String(id_text + identifiers[j].value + " ");
+                doc.add(new Field("identifier", identifiers[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", identifiers[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
-            // build the hash
-            textvalues.put("author", author_text);
-            textvalues.put("title", title_text);
-            textvalues.put("keyword", keyword_text);
-            textvalues.put("location", location_text);
-            textvalues.put("abstract", abstract_text);
-
-            textvalues.put("series", series_text);
-            textvalues.put("mimetype", mime_text);
-            textvalues.put("sponsor", sponsor_text);
-            textvalues.put("identifier", id_text);
         }
 
-        // now get full text of any bitstreams in the TEXT bundle
-        String extractedText = "";
+        log.info("  Added Metadata");
 
+        // now get full text of any bitstreams in the TEXT bundle
         // trundle through the bundles
         Bundle[] myBundles = myitem.getBundles();
@@ -692,30 +814,12 @@
                 {
                     InputStreamReader is = new InputStreamReader(
                             myBitstreams[j].retrieve()); // get input
-                    // stream
-
-                    StringBuffer sb = new StringBuffer();
-                    char[] charBuffer = new char[1024];
-
-                    while (true)
-                    {
-                        int bytesIn = is.read(charBuffer);
-
-                        if (bytesIn == -1)
-                        {
-                            break;
-                        }
-
-                        if (bytesIn > 0)
-                        {
-                            sb.append(charBuffer, 0, bytesIn);
-                        }
-                    }
-
-                    // now sb has the full text - tack on to fullText string
-                    extractedText = extractedText.concat(new String(sb));
+                    // Add each InputStream to the Indexed Document (Acts like an Append)
+                    doc.add(new Field("default", is));
+
+                    log.info("  Added BitStream: " + myBitstreams[j].getStoreNumber() + " " + myBitstreams[j].getName());
 
-                    // System.out.println("Found extracted text!\n" + new
-                    // String(sb));
                 }
                 catch (AuthorizeException e)
                 {
@@ -725,68 +829,44 @@
             }
         }
 
-        // lastly, get the handle
-        String itemhandle = HandleManager.findHandle(c, myitem);
-        textvalues.put("handletext", itemhandle);
+        log.info("  Writing Item: " + handle + " to Index");
 
-        if (log.isDebugEnabled())
-        {
-            log.debug(LogManager.getHeader(c, "write_index", "handle=" +itemhandle));
-            log.debug(textvalues.toString());
-        }
+        log.debug("  " + doc.toString());
+
+        // index the document
+        writer.addDocument(doc);
 
-        // write out the metatdata (for scalability, using hash instead of
-        // individual strings)
-        writeIndexRecord(writer, Constants.ITEM, itemhandle, textvalues,
-                extractedText);
     }
 
-    /**
-     * writeIndexRecord() creates a document from its args and writes it out to
-     * the index that is opened
-     */
-    private static void writeIndexRecord(IndexWriter iw, int type,
-            String handle, HashMap textvalues, String extractedText)
-            throws IOException
-    {
-        Document doc = new Document();
-        Integer ty = new Integer(type);
-        String fulltext = "";
+    private static Document createDocument(int type, String handle, String location){
 
-        // do id, type, handle first
-        doc.add(new Field("type", ty.toString(), Field.Store.YES, Field.Index.NO));
+        Document doc = new Document();
+
+        // do location, type, handle first
+        doc.add(new Field("type", Integer.toString(type), Field.Store.YES, Field.Index.NO));
 
         // want to be able to search for handle, so use keyword
         // (not tokenized, but it is indexed)
         if (handle != null)
         {
+            // ??? not sure what the "handletext" field is but it was there in writeItemIndex ???
+ doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.TOKENIZED)); + + // want to be able to search for handle, so use keyword + // (not tokenized, but it is indexed) doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED)); - } - // now iterate through the hash, building full text string - // and index all values - Iterator i = textvalues.keySet().iterator(); + // add to full text index + doc.add(new Field("default", handle, Field.Store.YES, Field.Index.TOKENIZED)); + } - while (i.hasNext()) + if(location != null) { - String key = (String) i.next(); - String value = (String) textvalues.get(key); - - fulltext = fulltext + " " + value; - - if (value != null) - { - doc.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED)); - } + doc.add(new Field("location", location, Field.Store.YES, Field.Index.TOKENIZED)); + doc.add(new Field("default", location, Field.Store.YES, Field.Index.TOKENIZED)); } - fulltext = fulltext.concat(extractedText); - - // System.out.println("Full Text:\n" + fulltext + "------------\n\n"); - // add the full text - doc.add(new Field("default", fulltext, Field.Store.YES, Field.Index.TOKENIZED)); - - // index the document - iw.addDocument(doc); + return doc; } + }