CREATE TABLE document (docid varchar NOT NULL,
title varchar,
abstract varchar NULL,
PRIMARY KEY (docid) );
CREATE TABLE reference (docid varchar NOT NULL,
refid varchar NOT NULL,
PRIMARY KEY (docid, refid),
FOREIGN KEY (docid) REFERENCES document(docid),
FOREIGN KEY (refid) REFERENCES document(docid) );
CREATE TABLE document (docid varchar NOT NULL,
title varchar,
abstract varchar NULL );
CREATE TABLE tempdoc (docid varchar NOT NULL,
title varchar );
CREATE TABLE reference (docid varchar NOT NULL,
refid varchar NOT NULL );
CREATE TABLE tempref (docid varchar NOT NULL,
refid varchar NOT NULL );
SELECT refid, MAX(title) FROM tempdoc WHERE refid NOT IN (SELECT docid FROM document) GROUP BY refid;
SELECT DISTINCT docid, refid FROM tempref WHERE refid IN (SELECT docid FROM document);
I also wanted to ask about performance differences across database modes. Right now I'm constructing the database in file mode and just storing it on the same machine that's running the application. I can't set up an actual server to offload the database maintenance onto a second machine, but
CREATE TABLE reference (docid varchar NOT NULL, refid varchar NOT NULL );CREATE TABLE reference_reverse (refid varchar NOT NULL,
docid varchar NOT NULL );Also, looking at your schema, and your saying that you are indexing the tables, you may want to consider doingCREATE TABLE reference (docid varchar NOT NULL,refid varchar NOT NULL );CREATE TABLE reference_reverse (refid varchar NOT NULL,docid varchar NOT NULL );and only having a primary key index on each one.This will make certain lookups faster, since the required data will be grouped closer together.
Statement docType = gConn.createStatement();
docType.execute("ALTER TABLE doctype ADD PRIMARY KEY (abbrev);");
docType.close();
Statement refDoc = gConn.createStatement();
refDoc.execute(
"CREATE TABLE indexDoc ("
+ " docid varchar,"
+ " title varchar,"
+ " content varchar,"
+ " sourceTitle varchar,"
+ " startPage varchar,"
+ " endPage varchar,"
+ " year varchar,"
+ " indexed bit NOT NULL DEFAULT 0 );");
refDoc.executeUpdate("INSERT INTO indexDoc (docid, indexed) ( SELECT docid, 1 FROM document );");
refDoc.executeUpdate(
"INSERT INTO indexDoc (docid, title, content, sourceTitle, startPage, endPage, year) ("
+ " SELECT refid, MAX(title), MAX(content), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)"
+ " FROM refdoc"
+ " GROUP BY refid );");
ResultSet refDocs = refDoc.executeQuery("SELECT docid, MAX(title), MAX(content)"
+ " FROM indexDoc"
+ " GROUP BY docid"
+ " HAVING MAX(indexed) = 0;");
while (refDocs.next())
{
Document doc = new Document();
doc.add(new Field("ID", refDocs.getString(1), FieldFactory.field(false)));
doc.add(new Field("title", refDocs.getString(2), FieldFactory.field(false)));
doc.add(new Field("content", refDocs.getString(3), FieldFactory.field(true)));
writer.addDocument(doc);
}
refDocs.close();
refDoc.executeUpdate(
"INSERT INTO document (docid, title, sourceTitle, startPage, endPage, year) ("
+ " SELECT docid, MAX(title), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)"
+ " FROM indexDoc"
+ " GROUP BY docid"
+ " HAVING MAX(indexed) = 0 );");
refDoc.execute("DROP TABLE indexDoc;");
refDoc.execute("DROP TABLE refdoc;");
refDoc.close();
Statement doc = gConn.createStatement();
doc.execute("ALTER TABLE document ADD PRIMARY KEY (docid);");
doc.execute("ALTER TABLE document ADD FOREIGN KEY (docType) REFERENCES doctype(abbrev);");
doc.close();
Statement ref = gConn.createStatement();
ref.execute("CREATE TABLE reference AS ( SELECT DISTINCT docid, refid FROM tempref );");
ref.execute("DROP TABLE tempref;");
ref.execute("ALTER TABLE reference MODIFY COLUMN docid varchar NOT NULL;");
ref.execute("ALTER TABLE reference MODIFY COLUMN refid varchar NOT NULL;");
ref.execute("ALTER TABLE reference ADD PRIMARY KEY (docid, refid);");
ref.execute("ALTER TABLE reference ADD FOREIGN KEY (docid) REFERENCES document(docid);");
ref.execute("ALTER TABLE reference ADD FOREIGN KEY (refid) REFERENCES document(docid);");
ref.close();SELECT docid, MAX(title), MAX(content)
FROM indexDocGROUP BY docidHAVING MAX(indexed) = 0;INSERT INTO document (docid, title, sourceTitle, startPage, endPage, year) (SELECT docid, MAX(title), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)FROM indexDocGROUP BY docid
HAVING MAX(indexed) = 0 );My suspicion is that nesting a SELECT inside any other function is going to cause these kinds of errors, and I will need to instead retrieve a ResultSet and use a PreparedStatement to batch-INSERT into tables in chunks of ~10,000 or so.
Statement docType = gConn.createStatement();
docType.execute("ALTER TABLE doctype ADD PRIMARY KEY (abbrev);");
docType.close();
Statement refDoc = gConn.createStatement();
refDoc.execute(
"CREATE TABLE indexDoc ("
+ " docid varchar,"
+ " title varchar,"
+ " content varchar,"
+ " sourceTitle varchar,"
+ " startPage varchar,"
+ " endPage varchar,"
+ " year varchar,"
+ " indexed bit NOT NULL DEFAULT 0 );");
refDoc.executeUpdate("INSERT INTO indexDoc (docid, indexed) ( SELECT docid, 1 FROM document );");
refDoc.executeUpdate(
"INSERT INTO indexDoc (docid, title, content, sourceTitle, startPage, endPage, year) ("
+ " SELECT refid, MAX(title), MAX(content), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)"
+ " FROM refdoc"
+ " GROUP BY refid );");
ResultSet refDocs =
refDoc.executeQuery("SELECT docid, MAX(title), MAX(content), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)"
+ " FROM indexDoc"
+ " GROUP BY docid"
+ " HAVING MAX(indexed) = 0;");
PreparedStatement insert =
gConn.prepareStatement(
"INSERT INTO document (docid, title, sourceTitle, startPage, endPage, year) VALUES (?, ?, ?, ?, ?, ?);");
int count = 0;
while (refDocs.next())
{
Document doc = new Document();
doc.add(new Field("ID", refDocs.getString(1), FieldFactory.field(false)));
doc.add(new Field("title", refDocs.getString(2), FieldFactory.field(false)));
doc.add(new Field("content", refDocs.getString(3), FieldFactory.field(true)));
writer.addDocument(doc);
insert.setString(1, refDocs.getString(1));
insert.setString(2, refDocs.getString(2));
insert.setString(3, refDocs.getString(4));
insert.setString(4, refDocs.getString(5));
insert.setString(5, refDocs.getString(6));
insert.setString(6, refDocs.getString(7));
insert.addBatch();
count++;
if(count >= 10000)
{
insert.executeBatch();
}
}
refDocs.close();
if (count > 0)
{
insert.executeBatch();
}
insert.close();
refDoc.execute("DROP TABLE indexDoc;");
refDoc.execute("DROP TABLE refdoc;");
refDoc.close();
Statement doc = gConn.createStatement();
doc.execute("ALTER TABLE document ADD PRIMARY KEY (docid);");
doc.execute("ALTER TABLE document ADD FOREIGN KEY (docType) REFERENCES doctype(abbrev);");
doc.close();
Statement ref = gConn.createStatement();
ref.execute("CREATE TABLE reference AS ( SELECT DISTINCT docid, refid FROM tempref );");
ref.execute("DROP TABLE tempref;");
ref.execute("ALTER TABLE reference MODIFY COLUMN docid varchar NOT NULL;");
ref.execute("ALTER TABLE reference MODIFY COLUMN refid varchar NOT NULL;");
ref.execute("ALTER TABLE reference ADD PRIMARY KEY (docid, refid);");
ref.execute("ALTER TABLE reference ADD FOREIGN KEY (docid) REFERENCES document(docid);");
ref.execute("ALTER TABLE reference ADD FOREIGN KEY (refid) REFERENCES document(docid);");
ref.close();
SELECT docid, MAX(title), MAX(content), MAX(sourceTitle), MAX(startPage), MAX(endPage), MAX(year)
FROM indexDoc
GROUP BY docidHAVING MAX(indexed) = 0;-vm
/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.222.b10-1.el7_7.x86_64/jre/bin
-startup
plugins/org.eclipse.equinox.launcher_1.5.300.v20190213-1655.jar
--launcher.library
/home/tfielder/.p2/pool/plugins/org.eclipse.equinox.launcher.gtk.linux.x86_64_1.1.1000.v20190125-2016
-product
org.eclipse.epp.package.java.product
-showsplash
org.eclipse.epp.package.common
--launcher.defaultAction
openFile
--launcher.appendVmargs
-vmargs
-Declipse.p2.max.threads=10
-Doomph.update.url=http://download.eclipse.org/oomph/updates/milestone/latest
-Doomph.redirection.index.redirection=index:/->http://git.eclipse.org/c/oomph/org.eclipse.oomph.git/plain/setups/
-Dosgi.requiredJavaVersion=1.8
-Dosgi.instance.area.default=@user.home/eclipse-workspace
-XX:+UseG1GC
-XX:+UseStringDeduplication
--add-modules=ALL-SYSTEM
-Dosgi.requiredJavaVersion=1.8
-Dosgi.dataAreaRequiresExplicitInit=true
-Xms256m
-Xmx60g
--add-modules=ALL-SYSTEM
--
You received this message because you are subscribed to the Google Groups "H2 Database" group.
To unsubscribe from this group and stop receiving emails from it, send an email to h2-database...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/h2-database/8b291c44-39e7-4d20-a9ea-63071fe8fb08%40googlegroups.com.