### Eclipse Workspace Patch 1.0
#P dspace
Index: src/org/dspace/search/DSIndexer.java
===================================================================
RCS file: /cvsroot/dspace/dspace/src/org/dspace/search/DSIndexer.java,v
retrieving revision 1.41
diff -u -r1.41 DSIndexer.java
--- src/org/dspace/search/DSIndexer.java	3 Nov 2006 05:01:31 -0000	1.41
+++ src/org/dspace/search/DSIndexer.java	13 Dec 2006 18:12:29 -0000
@@ -39,13 +39,17 @@
  */
 package org.dspace.search;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.sql.SQLException;
 import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.PosixParser;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
@@ -62,7 +66,6 @@
 import org.dspace.content.DSpaceObject;
 import org.dspace.content.Item;
 import org.dspace.content.ItemIterator;
-import org.dspace.content.MetadataSchema;
 import org.dspace.core.ConfigurationManager;
 import org.dspace.core.Constants;
 import org.dspace.core.Context;
@@ -82,7 +85,7 @@
     // TODO: Support for analyzers per language, or multiple indices
 
     /** The analyzer for this DSpace instance */
     private static Analyzer analyzer = null;
-    
+
     /**
      * IndexItem() adds a single item to the index
      */
@@ -122,7 +125,7 @@
     /**
      * unIndex removes an Item, Collection, or Community only works if the
      * DSpaceObject has a handle (uses the handle for its unique ID)
-     * 
+     *
      * @param dso
      *            DSpace Object, can be Community, Item, or Collection
      */
@@ -165,7 +168,7 @@
 
     /**
      * reIndexContent removes something from the index, then re-indexes it
-     * 
+     *
      * @param c context object
      * @param dso object to re-index
      */
@@ -178,7 +181,7 @@
 
     /**
      * create full index - wiping old index
-     * 
+     *
      * @param c context to use
      */
     public static void createIndex(Context c) throws SQLException, IOException
@@ -190,7 +193,83 @@
             indexAllCommunities(c, writer);
             indexAllCollections(c, writer);
             indexAllItems(c, writer);
+        }
+        finally
+        {
+            closeIndex(c, writer);
+        }
+    }
+
+    public static void updateIndex(Context c, boolean test) throws SQLException, IOException
+    {
+
+        File index_dir = new File(System.getProperty("java.io.tmpdir"), System.currentTimeMillis() + "index");
+
+        IndexWriter writer = openIndex(c, index_dir, true);
+
+        try
+        {
+            indexAllCommunities(c, writer);
+            indexAllCollections(c, writer);
+            indexAllItems(c, writer);
+
+        }
+        finally
+        {
+            closeIndex(c, writer);
+        }
+
+        if(!test)
+        {
+            IndexReader reader = IndexReader.open(index_dir);
+
+            try
+            {
+                writer = openIndex(c, true);
+                writer.addIndexes( new IndexReader[]{ reader } );
+            }
+            finally
+            {
+                closeIndex(c, writer);
+                reader.close();
+                recursiveDelete(index_dir);
+            }
+        }
+
+    }
+
+    /**
+     * Recursively deletes contents of a directory
+     *
+     * (Note: Ant libraries do this, would be nice to have Ant as a tool in DSpace runtime)
+     * @param directory Directory to Delete.
+     */
+    private static void recursiveDelete(File directory){
+        File[] children = directory.listFiles();
+        for(int i = 0 ; i < children.length ; i++){
+            if(children[i].isFile())
+                children[i].delete();
+            else if (children[i].isDirectory())
+                recursiveDelete(children[i]);
+        }
+        directory.delete();
+    }
+
+    /**
+     * Optimize the existing index.
+     *
+     * @param c the user's Context
+     * @throws SQLException
+     * @throws IOException
+     */
+    public static void optimizeIndex(Context c) throws SQLException, IOException
+    {
+        IndexWriter writer = openIndex(c, false);
+
+        try
+        {
             // optimize the index - important to do regularly to reduce
             // filehandle
             // usage
@@ -205,34 +284,76 @@
 
     /**
      * When invoked as a command-line tool, (re)-builds the whole index
-     * 
+     *
      * @param args
      *            the command-line arguments, none used
      */
     public static void main(String[] args) throws Exception
     {
+        // create an options object and populate it
+        CommandLineParser parser = new PosixParser();
+
+        Options options = new Options();
+        options
+                .addOption("r", "remove", true,
+                        "remove an item from the index based on its handle");
+        options.addOption("o", "optimize", false, "optimize existing index");
+        options.addOption("c", "create", false,
+                "build new index in search directory, destroying existing index first");
+        options.addOption("u", "update", false,
+                "build new index in tmp directory and replace search directory when complete");
+
+        options
+                .addOption(
+                        "t",
+                        "test",
+                        false,
+                        "If used in conjunction with 'u', will only build new index, and will not replace old with new at end");
+
+        options.addOption("h", "help", false, "Print this help message");
+
+        CommandLine line = parser.parse(options, args);
+
+        if(!(line.hasOption("r") || line.hasOption("o") || line.hasOption("c") || line.hasOption("u")))
+        {
+            // automatically generate the help statement
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp( "DSIndexer", options );
+            System.exit(1);
+        }
+
         Context c = new Context();
 
+        c.setIgnoreAuthorization(true);
+
         // for testing, pass in a handle of something to remove...
-        if ((args.length == 2) && (args[0].equals("remove")))
-        {
-            unIndexContent(c, args[1]);
+        if (line.hasOption("r")){
+            log.info("Removing " + line.getOptionValue("r") + " from Index");
+            unIndexContent(c, line.getOptionValue("r"));
+        }
+        else if(line.hasOption("u")){
+            log.info("Updating Existing Index");
+            updateIndex(c, line.hasOption("t"));
         }
-        else
+        else if(line.hasOption("c"))
         {
-            c.setIgnoreAuthorization(true);
-
+            log.info("Creating New Index");
             createIndex(c);
+        }
 
-            System.out.println("Done with indexing");
+        if (line.hasOption("o"))
+        {
+            log.info("Optimizing Index");
+            optimizeIndex(c);
         }
+
+        log.info("Done with indexing");
     }
 
     /**
      * Get the Lucene analyzer to use according to current configuration (or
      * default). TODO: Should have multiple analyzers (and maybe indices?) for
      * multi-lingual DSpaces.
-     * 
+     *
      * @return Analyzer to use
      * @throws IllegalStateException
      *             if the configured analyzer can't be instantiated
@@ -267,23 +388,35 @@
 
         return analyzer;
     }
-    
-    
+
+
     ////////////////////////////////////
     // Private
     ////////////////////////////////////
 
     /**
-     * prepare index, opening writer, and wiping out existing index if necessary
+     * prepare the main index in the configured location, opening writer, and wiping out existing index if necessary
      */
    private static IndexWriter openIndex(Context c, boolean wipe_existing)
            throws IOException
    {
-        IndexWriter writer;
-
        String index_directory = ConfigurationManager.getProperty("search.dir");
 
+        return openIndex(c, new File(index_directory), wipe_existing);
+    }
+
+    /**
+     * prepare index in specified location, opening writer, and wiping out existing index if necessary
+     */
+    private static IndexWriter openIndex(Context c, File index_dir, boolean wipe_existing)
+            throws IOException
+    {
 
-        writer = new IndexWriter(index_directory, getAnalyzer(),
+        if(!index_dir.exists())
+        {
+            index_dir.mkdirs();
+        }
+
+        IndexWriter writer = new IndexWriter(index_dir, getAnalyzer(),
                 wipe_existing);
 
         /* Set maximum number of terms to index if present in dspace.cfg */
@@ -303,7 +436,6 @@
 
         return writer;
     }
-
     /**
      * close up the indexing engine
      */
@@ -395,6 +527,8 @@
             Item target = (Item) i.next();
 
             writeItemIndex(c, writer, target);
+
+            target.decache();
         }
     }
 
@@ -404,24 +538,22 @@
     private static void writeCommunityIndex(Context c, IndexWriter writer,
             Community target) throws SQLException, IOException
     {
-        // build a hash for the metadata
-        HashMap textvalues = new HashMap();
+        // get the handle
+        String handle = HandleManager.findHandle(c, target);
 
-        // get the handle
-        String myhandle = HandleManager.findHandle(c, target);
+        // Create Lucene Document
+        Document doc = createDocument(Constants.COMMUNITY, handle, null);
 
         // and populate it
         String name = target.getMetadata("name");
+        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
+        doc.add(new Field("default", name, Field.Store.YES, Field.Index.TOKENIZED));
 
-        // String description = target.getMetadata("short_description");
-        // String intro_text = target.getMetadata("introductory_text");
-        textvalues.put("name", name);
-
-        // textvalues.put("description", description);
-        // textvalues.put("intro_text", intro_text );
-        textvalues.put("handletext", myhandle);
+        log.info("Writing Community: " + handle + " to Index");
+
+        // Write to Lucene Index
+        writer.addDocument(doc);
 
-        writeIndexRecord(writer, Constants.COMMUNITY, myhandle, textvalues, "");
     }
 
     /**
@@ -432,25 +564,23 @@
     {
         String location_text = buildCollectionLocationString(c, target);
 
-        // get the handle 
+        // get the handle
         String myhandle = HandleManager.findHandle(c, target);
 
-        // build a hash for the metadata
-        HashMap textvalues = new HashMap();
+
+        // Create Lucene Document
+        Document doc = createDocument(Constants.COLLECTION, myhandle, location_text);
 
         // and populate it
         String name = target.getMetadata("name");
+        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
+        doc.add(new Field("default", name, Field.Store.YES, Field.Index.TOKENIZED));
 
-        // String description = target.getMetadata("short_description");
-        // String intro_text = target.getMetadata("introductory_text");
-        textvalues.put("name", name);
-
-        // textvalues.put("description",description );
-        // textvalues.put("intro_text", intro_text );
-        textvalues.put("location", location_text);
-        textvalues.put("handletext", myhandle);
+        log.info("Writing Collection: " + myhandle + " to Index");
 
-        writeIndexRecord(writer, Constants.COLLECTION, myhandle, textvalues, "");
+        // Write to Lucene Index
+        writer.addDocument(doc);
+
     }
 
     /**
@@ -460,10 +590,17 @@
     private static void writeItemIndex(Context c, IndexWriter writer,
             Item myitem) throws SQLException, IOException
     {
-        // FIXME: config reading should happen just once & be cached?
-
-        // get the location string (for searching by collection & community)
-        String location_text = buildItemLocationString(c, myitem);
+        // FIXME: config reading should happen just once & be cached?
+
+        // get the location string (for searching by collection & community)
+        String location = buildItemLocationString(c, myitem);
+
+        // firstly, get the handle
+        String handle = HandleManager.findHandle(c, myitem);
+
+        Document doc = createDocument(Constants.ITEM, handle, location);
+
+        log.info("Building Item: " + handle);
 
         // read in indexes from the config
         ArrayList indexes = new ArrayList();
@@ -477,9 +614,6 @@
         int j;
         int k = 0;
 
-        // initialize hash to be built
-        HashMap textvalues = new HashMap();
-
         if (indexes.size() > 0)
         {
             ArrayList fields = new ArrayList();
@@ -500,7 +634,7 @@
                 // Get the schema, element and qualifier for the index
                 // TODO: Should check valid schema, element, qualifier?
                 String[] parts = configLine[1].split("\\.");
-                
+
                 switch (parts.length)
                 {
                 case 3:
@@ -515,7 +649,7 @@
                     throw new RuntimeException(
                             "Malformed configuration line: search.index." + i);
                 }
-                
+
                 // extract metadata (ANY is wildcard from Item class)
                 if (qualifier!= null && qualifier.equals("*"))
                 {
@@ -558,123 +692,111 @@
             // build the hash
             for (int i = 0; i < fields.size(); i++)
             {
-                textvalues.put((String) fields.get(i), (String) content.get(i));
+
+                doc.add(
+                    new Field(
+                        (String) fields.get(i),
+                        (String) content.get(i),
+                        Field.Store.YES, Field.Index.TOKENIZED
+                    ));
+
+                doc.add(new Field("default", (String) content.get(i), Field.Store.YES, Field.Index.TOKENIZED));
+
             }
 
-            textvalues.put("location", location_text);
         }
         else
         // if no search indexes found in cfg file, for backward compatibility
         {
+            // extract metadata (ANY is wildcard from Item class)
             DCValue[] authors = myitem.getDC("contributor", Item.ANY, Item.ANY);
-            DCValue[] creators = myitem.getDC("creator", Item.ANY, Item.ANY);
-            DCValue[] titles = myitem.getDC("title", Item.ANY, Item.ANY);
-            DCValue[] keywords = myitem.getDC("subject", Item.ANY, Item.ANY);
-
-            DCValue[] abstracts = myitem.getDC("description", "abstract",
-                    Item.ANY);
-            DCValue[] sors = myitem.getDC("description",
-                    "statementofresponsibility", Item.ANY);
-            DCValue[] series = myitem.getDC("relation", "ispartofseries",
-                    Item.ANY);
-            DCValue[] tocs = myitem.getDC("description", "tableofcontents",
-                    Item.ANY);
-            DCValue[] mimetypes = myitem.getDC("format", "mimetype", Item.ANY);
-            DCValue[] sponsors = myitem.getDC("description", "sponsorship",
-                    Item.ANY);
-            DCValue[] identifiers = myitem.getDC("identifier", Item.ANY,
-                    Item.ANY);
-
-            // put them all from an array of strings to one string for writing
-            // out
-            String author_text = "";
-            String title_text = "";
-            String keyword_text = "";
-
-            String abstract_text = "";
-            String series_text = "";
-            String mime_text = "";
-            String sponsor_text = "";
-            String id_text = "";
-
-            // pack all of the arrays of DCValues into plain text strings for
-            // the
-            // indexer
             for (j = 0; j < authors.length; j++)
             {
-                author_text = new String(author_text + authors[j].value + " ");
+                doc.add(new Field("author", authors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", authors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] creators = myitem.getDC("creator", Item.ANY, Item.ANY);
             for (j = 0; j < creators.length; j++) //also authors
             {
-                author_text = new String(author_text + creators[j].value + " ");
+                doc.add(new Field("author", creators[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", creators[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] sors = myitem.getDC("description",
+                    "statementofresponsibility", Item.ANY);
             for (j = 0; j < sors.length; j++) //also authors
             {
-                author_text = new String(author_text + sors[j].value + " ");
+                doc.add(new Field("author", sors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", sors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] titles = myitem.getDC("title", Item.ANY, Item.ANY);
             for (j = 0; j < titles.length; j++)
             {
-                title_text = new String(title_text + titles[j].value + " ");
+                doc.add(new Field("title", titles[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", titles[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] keywords = myitem.getDC("subject", Item.ANY, Item.ANY);
             for (j = 0; j < keywords.length; j++)
             {
-                keyword_text = new String(keyword_text + keywords[j].value
-                        + " ");
+                doc.add(new Field("keyword", keywords[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", keywords[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] abstracts = myitem.getDC("description", "abstract", Item.ANY);
             for (j = 0; j < abstracts.length; j++)
             {
-                abstract_text = new String(abstract_text + abstracts[j].value
-                        + " ");
+                doc.add(new Field("abstract", abstracts[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", abstracts[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] tocs = myitem.getDC("description", "tableofcontents",
+                    Item.ANY);
             for (j = 0; j < tocs.length; j++)
             {
-                abstract_text = new String(abstract_text + tocs[j].value + " ");
+                doc.add(new Field("abstract", tocs[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", tocs[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] series = myitem.getDC("relation", "ispartofseries",
+                    Item.ANY);
             for (j = 0; j < series.length; j++)
             {
-                series_text = new String(series_text + series[j].value + " ");
+                doc.add(new Field("series", series[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", series[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] mimetypes = myitem.getDC("format", "mimetype", Item.ANY);
             for (j = 0; j < mimetypes.length; j++)
             {
-                mime_text = new String(mime_text + mimetypes[j].value + " ");
+                doc.add(new Field("mimetype", mimetypes[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", mimetypes[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] sponsors = myitem.getDC("description", "sponsorship",
+                    Item.ANY);
             for (j = 0; j < sponsors.length; j++)
            {
-                sponsor_text = new String(sponsor_text + sponsors[j].value
-                        + " ");
+                doc.add(new Field("sponsor", sponsors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", sponsors[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
+            DCValue[] identifiers = myitem.getDC("identifier", Item.ANY,
+                    Item.ANY);
             for (j = 0; j < identifiers.length; j++)
             {
-                id_text = new String(id_text + identifiers[j].value + " ");
+                doc.add(new Field("identifier", identifiers[j].value, Field.Store.YES, Field.Index.TOKENIZED));
+                doc.add(new Field("default", identifiers[j].value, Field.Store.YES, Field.Index.TOKENIZED));
             }
 
-            // build the hash
-            textvalues.put("author", author_text);
-            textvalues.put("title", title_text);
-            textvalues.put("keyword", keyword_text);
-            textvalues.put("location", location_text);
-            textvalues.put("abstract", abstract_text);
-
-            textvalues.put("series", series_text);
-            textvalues.put("mimetype", mime_text);
-            textvalues.put("sponsor", sponsor_text);
-            textvalues.put("identifier", id_text);
         }
 
-        // now get full text of any bitstreams in the TEXT bundle
-        String extractedText = "";
+        log.info("  Added Metadata");
 
+        // now get full text of any bitstreams in the TEXT bundle
         // trundle through the bundles
         Bundle[] myBundles = myitem.getBundles();
@@ -692,30 +814,12 @@
                 {
                     InputStreamReader is = new InputStreamReader(
                             myBitstreams[j].retrieve()); // get input
-                    // stream
-
-                    StringBuffer sb = new StringBuffer();
-                    char[] charBuffer = new char[1024];
-
-                    while (true)
-                    {
-                        int bytesIn = is.read(charBuffer);
-
-                        if (bytesIn == -1)
-                        {
-                            break;
-                        }
-
-                        if (bytesIn > 0)
-                        {
-                            sb.append(charBuffer, 0, bytesIn);
-                        }
-                    }
-
-                    // now sb has the full text - tack on to fullText string
-                    extractedText = extractedText.concat(new String(sb));
+                    // Add each InputStream to the Indexed Document (Acts like an Append)
+                    doc.add(new Field("default", is));
+
+                    log.info("  Added BitStream: " + myBitstreams[j].getStoreNumber() + " " + myBitstreams[j].getName());
 
-                    // System.out.println("Found extracted text!\n" + new
-                    // String(sb));
                 }
                 catch (AuthorizeException e)
                 {
@@ -725,68 +829,44 @@
             }
         }
 
-        // lastly, get the handle
-        String itemhandle = HandleManager.findHandle(c, myitem);
-        textvalues.put("handletext", itemhandle);
+        log.info("  Writing Item: " + handle + " to Index");
 
-        if (log.isDebugEnabled())
-        {
-            log.debug(LogManager.getHeader(c, "write_index", "handle=" +itemhandle));
-            log.debug(textvalues.toString());
-        }
+        log.debug("  " + doc.toString());
+
+        // index the document
+        writer.addDocument(doc);
 
-        // write out the metatdata (for scalability, using hash instead of
-        // individual strings)
-        writeIndexRecord(writer, Constants.ITEM, itemhandle, textvalues,
-                extractedText);
     }
 
-    /**
-     * writeIndexRecord() creates a document from its args and writes it out to
-     * the index that is opened
-     */
-    private static void writeIndexRecord(IndexWriter iw, int type,
-            String handle, HashMap textvalues, String extractedText)
-            throws IOException
-    {
-        Document doc = new Document();
-        Integer ty = new Integer(type);
-        String fulltext = "";
+    private static Document createDocument(int type, String handle, String location){
 
-        // do id, type, handle first
-        doc.add(new Field("type", ty.toString(), Field.Store.YES, Field.Index.NO));
+        Document doc = new Document();
+
+        // do location, type, handle first
+        doc.add(new Field("type", Integer.toString(type), Field.Store.YES, Field.Index.NO));
 
         // want to be able to search for handle, so use keyword
         // (not tokenized, but it is indexed)
         if (handle != null)
         {
+            // ??? not sure what the "handletext" field is but it was there in writeItemIndex ???
+ doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.TOKENIZED)); + + // want to be able to search for handle, so use keyword + // (not tokenized, but it is indexed) doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED)); - } - // now iterate through the hash, building full text string - // and index all values - Iterator i = textvalues.keySet().iterator(); + // add to full text index + doc.add(new Field("default", handle, Field.Store.YES, Field.Index.TOKENIZED)); + } - while (i.hasNext()) + if(location != null) { - String key = (String) i.next(); - String value = (String) textvalues.get(key); - - fulltext = fulltext + " " + value; - - if (value != null) - { - doc.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED)); - } + doc.add(new Field("location", location, Field.Store.YES, Field.Index.TOKENIZED)); + doc.add(new Field("default", location, Field.Store.YES, Field.Index.TOKENIZED)); } - fulltext = fulltext.concat(extractedText); - - // System.out.println("Full Text:\n" + fulltext + "------------\n\n"); - // add the full text - doc.add(new Field("default", fulltext, Field.Store.YES, Field.Index.TOKENIZED)); - - // index the document - iw.addDocument(doc); + return doc; } + }