In some cases I have found it convenient to flatten categorical variables into separate new (dummy) columns instead of wrapping them in a PooledDataArray or CategoricalVariables. Here are some functions for doing this:
# Build dummy (indicator) columns for the categorical column `cname` of `df`.
#
# The levels of the column are sorted and the first one is dropped: it acts as
# the reference level, so a column with k levels yields k-1 dummy columns.
# Each remaining level `k` becomes a column named `cname_k` that holds 1 in the
# rows where the column equals that level and 0 elsewhere. Rows whose value is
# not one of the retained levels are left as all zeros.
#
# Arguments:
#   df    - source DataFrame
#   cname - name of the categorical column to expand
#   R     - element type of the generated dummy columns
#
# Returns a new DataFrame that contains only the dummy columns.
function getdummy{R}(df::DataFrame, cname::Symbol, ::Type{R})
    darr = df[cname]
    # Drop the first sorted level: it is the reference category.
    vals = sort(levels(darr))[2:end]
    # Map each retained level to the index of its dummy column.
    namedict = Dict(vals, 1:length(vals))
    arr = zeros(R, length(darr), length(namedict))
    for i in 1:length(darr)
        # Single dictionary lookup; 0 is never a valid column index, so it
        # marks "this row has no dummy column".
        j = get(namedict, darr[i], 0)
        if j != 0
            arr[i, j] = 1
        end
    end
    newdf = convert(DataFrame, arr)
    # Typed comprehension so the result is always a Vector{Symbol}, even when
    # `vals` is empty or its element type cannot be inferred.
    names!(newdf, Symbol[symbol("$(cname)_$k") for k in vals])
    return newdf
end
# Expand the categorical columns of `df` listed in `cnames` into dummy columns.
#
# Columns are visited in the order given by names(df). A column whose name is
# in `cnames` is replaced, at its position, by the columns that getdummy
# returns for it (with element type R); every other column is carried over
# under its original name. Names in `cnames` that are not columns of `df` have
# no effect. Returns the newly assembled DataFrame.
function convertdummy{R}(df::DataFrame, cnames::Array{Symbol}, ::Type{R})
    result = DataFrame()
    for col in names(df)
        if in(col, cnames)
            # Categorical column: splice in its dummy columns one by one.
            expanded = getdummy(df, col, R)
            for dummycol in names(expanded)
                result[dummycol] = expanded[dummycol]
            end
        else
            # Not listed in cnames: keep the column as it is.
            result[col] = df[col]
        end
    end
    return result
end
# Convenience method: expand the columns in `cnames` using Int32 dummy columns.
function convertdummy(df::DataFrame, cnames::Array{Symbol})
    return convertdummy(df, cnames, Int32)
end