For me, docount2 is about 1.7x faster than docount1 (on my old laptop, 28s vs
47s) for your "Hungarian Wikipedia" test dataset. We might want to implement
some of these tweaks in the standard library.
--Tim
import Base: start, next, done, eltype, readuntil
"""
    docount1(io)

Baseline word counter. Iterate over the lines of `io` with `eachline`, split
each line with `split`, and return a `Dict{AbstractString,Int64}` mapping every
word to the number of times it was seen.
"""
function docount1(io)
    counts = Dict{AbstractString,Int64}()
    for line in eachline(io), word in split(line)
        # A word that has not been seen yet starts from a count of zero.
        counts[word] = 1 + get(counts, word, 0)
    end
    return counts
end
# Line-by-line iterator over a stream. The type parameter `T` is not used by any
# field; the `next` and `eltype` methods defined for this type use it as the type
# each line is converted to.
# NOTE(review): `stream` is declared with the abstract type `IO`, so the concrete
# stream type is not part of `EachLn`'s type parameters.
type EachLn{T}
# Stream the lines are read from.
stream::IO
end
# Iteration protocol: `EachLn` carries no iteration state of its own, so the
# initial state is simply `nothing`.
function start(itr::EachLn)
    return nothing
end
# Iteration protocol: the iterator is exhausted exactly when the wrapped stream
# reports end-of-file. The state argument `nada` is ignored.
done(itr::EachLn, nada) = eof(itr.stream)
# Iteration protocol: read the next line from the stream via the typed
# `readuntil(T, stream, '\n')` method and pair it with the (unused) next state,
# `nothing`.
function next{T}(itr::EachLn{T}, nada)
    line = readuntil(T, itr.stream, '\n')
    return (line, nothing)
end
# The element type of an `EachLn{T}` iterator is its type parameter `T`.
function eltype{T}(::Type{EachLn{T}})
    return T
end
"""
    readuntil(T, s::IO, delim::Char)

Read from `s` until `delim` is encountered or the stream is exhausted, and
return what was read converted with `T(...)`.

A delimiter below `Char(0x80)` is delegated to the byte-oriented
`readuntil(s, ::UInt8)`. Any other delimiter is searched for one character at a
time; in that path the delimiter, when found, is included in the result.

NOTE(review): this adds a method to `readuntil` for argument types this file
does not own; confirm that is intended before reusing it outside this script.
"""
function readuntil{T}(::Type{T}, s::IO, delim::Char)
    # Fast path: hand a sub-0x80 delimiter to the byte-level reader as one byte.
    delim < Char(0x80) && return T(readuntil(s, delim % UInt8))

    # Slow path: copy characters into a scratch buffer until the delimiter has
    # been written or the stream runs out.
    buf = IOBuffer()
    while !eof(s)
        ch = read(s, Char)
        write(buf, ch)
        ch == delim && break
    end
    return T(takebuf_array(buf))
end
"""
    docount2(io)

Tuned word counter. Lines are pulled from `io` through `EachLn{UTF8String}`,
each line is split with `split`, and the tallies are stored in a
`Dict{SubString{UTF8String},Int64}` keyed by word, which is returned.
"""
function docount2(io)
    counts = Dict{SubString{UTF8String},Int64}()
    for line in EachLn{UTF8String}(io), word in split(line)
        # A word that has not been seen yet starts from a count of zero.
        counts[word] = 1 + get(counts, word, 0)
    end
    return counts
end