Hey guys,
Looking forward to seeing all of you tonight and at the session tomorrow.
I tried training/testing on a largish dataset with only the Maxent and SVM algorithms, but it runs out of memory anyway; the transcript is included below.
I create the corpus vector using some database queries (in the Agenda.* methods; see below), but the train_models is the 'factory-installed' method.
As you can see, I am training on 36k documents (all one or two line TV item summaries), and testing on 4k documents.
The machine has 10GB memory, so it should be able to handle it ;-)
-- Wouter
> library(RTextTools)
...
> source("amcat.R")
Loading required package: DBI
Loading required package: rJava
> trainjobs = c(5860,5767,4867,4707,4191,4126,4079,4016,2571)
> testjobs = c(5807, 4866)
>
> connection = connect("app", "secretpassword")
>
> train = Agenda.getCodedArticles(connection, trainjobs)
> test = Agenda.getCodedArticles(connection, testjobs)
> dim(train)
[1] 36662 5
> dim(test)
[1] 3996 5
> c = Agenda.getCorpus(train, test, language="fr")
> models = train_models(c, algorithms=c("MAXENT", "SVM"))
Training the new model...
preparing for estimation...done
number of samples = 36662
number of features = 140291
calculating empirical expectation...done
performing LMVM
0 logl(err) = -5.472271 (0.9981)
1 logl(err) = -5.372594 (0.4198)
2 logl(err) = -4.937689 (0.3357)
3 logl(err) = -2.895228 (0.2867)
4 logl(err) = -2.265063 (0.2652)
5 logl(err) = -1.867822 (0.2407)
6 logl(err) = -1.527532 (0.2125)
7 logl(err) = -1.273630 (0.1895)
8 logl(err) = -1.100501 (0.1750)
9 logl(err) = -0.929080 (0.1473)
10 logl(err) = -0.836051 (0.1346)
11 logl(err) = -0.699707 (0.1198)
12 logl(err) = -0.564398 (0.1001)
13 logl(err) = -0.441807 (0.0828)
14 logl(err) = -0.333912 (0.0659)
15 logl(err) = -0.267858 (0.0533)
16 logl(err) = -0.212053 (0.0424)
17 logl(err) = -0.168572 (0.0373)
18 logl(err) = -0.134340 (0.0298)
19 logl(err) = -0.111698 (0.0244)
20 logl(err) = -0.092270 (0.0206)
21 logl(err) = -0.079662 (0.0187)
22 logl(err) = -0.068819 (0.0163)
23 logl(err) = -0.060227 (0.0151)
24 logl(err) = -0.049325 (0.0139)
25 logl(err) = -0.046275 (0.0130)
26 logl(err) = -0.040652 (0.0124)
27 logl(err) = -0.038172 (0.0122)
28 logl(err) = -0.035171 (0.0119)
29 logl(err) = -0.031924 (0.0117)
30 logl(err) = -0.029920 (0.0115)
31 logl(err) = -0.027669 (0.0112)
32 logl(err) = -0.026677 (0.0112)
33 logl(err) = -0.025601 (0.0114)
34 logl(err) = -0.024310 (0.0110)
35 logl(err) = -0.023345 (0.0110)
36 logl(err) = -0.022622 (0.0109)
37 logl(err) = -0.022118 (0.0109)
38 logl(err) = -0.021542 (0.0109)
39 logl(err) = -0.020366 (0.0108)
40 logl(err) = -0.019907 (0.0108)
41 logl(err) = -0.019527 (0.0108)
42 logl(err) = -0.019157 (0.0108)
43 logl(err) = -0.018908 (0.0107)
44 logl(err) = -0.018660 (0.0108)
45 logl(err) = -0.018500 (0.0108)
46 logl(err) = -0.018362 (0.0108)
47 logl(err) = -0.018135 (0.0107)
48 logl(err) = -0.017941 (0.0107)
49 logl(err) = -0.017780 (0.0107)
50 logl(err) = -0.017540 (0.0107)
51 logl(err) = -0.017320 (0.0107)
52 logl(err) = -0.017120 (0.0107)
53 logl(err) = -0.016975 (0.0107)
54 logl(err) = -0.016897 (0.0106)
55 logl(err) = -0.016744 (0.0106)
56 logl(err) = -0.016676 (0.0106)
57 logl(err) = -0.016596 (0.0106)
58 logl(err) = -0.016566 (0.0106)
59 logl(err) = -0.016524 (0.0106)
60 logl(err) = -0.016484 (0.0106)
61 logl(err) = -0.016452 (0.0106)
62 logl(err) = -0.016419 (0.0106)
63 logl(err) = -0.016388 (0.0106)
64 logl(err) = -0.016368 (0.0106)
65 logl(err) = -0.016305 (0.0106)
66 logl(err) = -0.016264 (0.0106)
67 logl(err) = -0.016230 (0.0106)
68 logl(err) = -0.016215 (0.0106)
69 logl(err) = -0.016191 (0.0106)
70 logl(err) = -0.016168 (0.0106)
71 logl(err) = -0.016140 (0.0106)
72 logl(err) = -0.016120 (0.0106)
73 logl(err) = -0.016099 (0.0106)
74 logl(err) = -0.016085 (0.0106)
75 logl(err) = -0.016074 (0.0106)
76 logl(err) = -0.016060 (0.0106)
77 logl(err) = -0.016054 (0.0106)
78 logl(err) = -0.016039 (0.0106)
79 logl(err) = -0.016029 (0.0106)
80 logl(err) = -0.016022 (0.0106)
Error: cannot allocate vector of size 7.7 Gb
Execution halted
----------------------------------------------------------------------
wva@amcatsql2:~/tmp/ml_french$ cat amcat.R
library(RJDBC)
connect <- function(user, pass) {
  # Open a connection to the AmCAT SQL Server database over JDBC.
  #
  # user, pass: credentials, handed to dbConnect unchanged.
  # Returns the value of the dbConnect call.
  jar_path <- ".:/home/wva/tmp/ml_french/sqljdbc4.jar"
  db_url <- "jdbc:sqlserver://amcat-sql.vu.nl:1433;DatabaseName=anoko"
  # The driver is configured with a backtick as identifier quote character.
  sqlserver <- JDBC(
    driverClass = "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    classPath = jar_path,
    identifier.quote = "`"
  )
  dbConnect(sqlserver, db_url, user, pass)
}
Agenda.getCodedArticles <- function(connection, jobs) {
  # Fetch the coded articles (article ids, topic code and text) that belong
  # to a set of coding jobs.
  #
  # connection: an open DBI connection; passed straight to dbGetQuery.
  # jobs: non-empty vector of whole-number coding job ids. Numeric input is
  #   used as is; anything else (character, factor) must parse as numbers.
  # Returns the query result as a data frame, without the rows whose `topic`
  # is NA.
  # Stops with an error when `jobs` is empty or holds a value that is not a
  # finite whole number.
  job_ids <- if (is.numeric(jobs)) {
    jobs
  } else {
    # Unparseable values become NA here and are rejected just below.
    suppressWarnings(as.numeric(as.character(jobs)))
  }
  ids_ok <- length(job_ids) > 0L &&
    all(is.finite(job_ids)) &&
    all(job_ids == trunc(job_ids))
  if (!ids_ok) {
    stop("jobs must be a non-empty vector of whole-number coding job ids",
         call. = FALSE)
  }
  # sprintf keeps large ids as plain digits; paste() alone would render a
  # double such as 100000 as "1e+05" and corrupt the IN (...) list.
  joblist <- paste(sprintf("%.0f", as.numeric(job_ids)), collapse = ",")
  SQL <- paste(
    c(
      "select c.articleid, c.codingjob_articleid, topic as topicid,",
      "cast(substring(label,1,4) as int) as topic, text",
      "from antwerpen_articles_annotations n",
      "inner join codingjobs_articles c on n.codingjob_articleid = c.codingjob_articleid",
      "left join labels l on l.objectid = n.topic and l.languageid=2",
      "left join texts t on t.articleid = c.articleid and t.type=2"
    ),
    collapse = "\n"
  )
  SQL <- paste(SQL, "where codingjobid in (", joblist, ")")
  data <- dbGetQuery(connection, SQL)
  # Keep only the rows for which a numeric topic code could be derived.
  data[!is.na(data$topic), , drop = FALSE]
}
Agenda.getCorpus <- function(train, test, ...) {
  # Combine a training and a test data frame into one corpus object.
  #
  # train, test: data frames that can be row-bound and that provide the
  #   `text` and `topic` columns read below.
  # ...: forwarded unchanged to create_matrix (e.g. language = "fr").
  # Returns the value of create_corpus, called with the document matrix, the
  # topic labels, and the row indices of the training and test documents.
  # NOTE(review): create_matrix / create_corpus are presumably the RTextTools
  # functions loaded by the calling session -- confirm.
  data <- rbind(train, test)
  n_train <- nrow(train)
  # seq_len avoids the descending 1:0 / (n+1):n ranges that the `:` operator
  # produces when either part is empty.
  train.size <- seq_len(n_train)
  test.size <- n_train + seq_len(nrow(data) - n_train)
  td <- create_matrix(data$text, ...)
  create_corpus(td, data$topic, train.size, test.size)
}