library(magrittr)
library(dplyr)
library(lavaan)
#' Okay, so, let's start with a basic 1-factor SEM/CFA on the iris data set. We're
#' going to set up a categorical variable but treat it as numeric to start. What
#' I'm expecting (and please correct me if this expectation is incorrect) is that
#' lavPredict can score any single record from the original training set of an
#' SEM model and reproduce the latent score that record received when the
#' entire training set was scored. I start by showing that this is true for the
#' all-numeric case.
# Recode Species as its numeric factor code and drop the original column, so
# every indicator used by the model is a plain numeric variable.
df <- select(
  mutate(iris, species_fac = as.numeric(factor(Species))),
  -Species
)

# One-factor measurement model fitted on the full data set.
mdl <- sem("factor =~ Petal.Length + Petal.Width + species_fac", data = df)

# Reference value: the score row 150 receives when the training data itself
# is scored.
score_from_training <- lavPredict(mdl)[150, ]
# The goal is to score a new record. We can't do that directly: lavaan throws
# an error here because, even treated as numeric, species_fac has no variance
# in a single row.
# The failure is the point of this step, so it is caught and reported instead
# of being left to abort the script before the later examples run.
tryCatch(
  lavPredict(mdl, newdata = df[150, ]),
  error = function(e) {
    message("lavPredict() on a single row failed: ", conditionMessage(e))
  }
)
# One suggestion was to try appending new records to the original training set.
# This produces the EETAx error. For lavPredict, newdata cannot be larger than
# the original training set.
# The failure is the point of this step, so it is caught and reported instead
# of being left to abort the script before the later examples run.
tryCatch(
  lavPredict(mdl, newdata = bind_rows(df, df[150, ])),
  error = function(e) {
    message(
      "lavPredict() on the training set plus one row failed: ",
      conditionMessage(e)
    )
  }
)
# Okay, so that's fine; I want to score just a single row. Artificially inject
# some variance by putting a synthetic row (all ones) in front of the real
# record. Both rows are scored, but only the real record's score is kept.
synth <- data.frame(
  Sepal.Length = 1,
  Sepal.Width = 1,
  Petal.Length = 1,
  Petal.Width = 1,
  species_fac = 1
) %>%
  bind_rows(df[150, ])
possible_new_record <- lavPredict(mdl, newdata = synth)[2, ]

# In the numeric case, this works: the two scores match.
# The scores are doubles, so they are compared with a numeric tolerance rather
# than `==`; floating-point round-off between the two scoring paths can then
# never produce a spurious FALSE. Attributes are ignored so that only the
# values are compared, as `==` did.
isTRUE(
  all.equal(score_from_training, possible_new_record, check.attributes = FALSE)
)
# But this is not the case for the model with ordered observables.
# Same one-factor model, now declaring species_fac as an ordered indicator.
mdl <- sem(
  "factor =~ Petal.Length + Petal.Width + species_fac",
  data = df,
  ordered = "species_fac"
)
score_from_training <- lavPredict(mdl)[150, ]

# NOTE(review): `synth` contains only the species_fac values 1 (synthetic row)
# and 3 (row 150), while the training data has three categories. If lavPredict
# derives the ordered categories from newdata rather than from the fitted
# model, the record would be recoded differently here than during training --
# TODO confirm against the lavaan documentation.
possible_new_record <- lavPredict(mdl, newdata = synth)[2, ]

# Very different scores.
# Compared with a numeric tolerance rather than `==`, so a FALSE here reflects
# a real difference in the scores and not floating-point round-off. Attributes
# are ignored so that only the values are compared, as `==` did.
isTRUE(
  all.equal(score_from_training, possible_new_record, check.attributes = FALSE)
)
```