I want to fit a mixed-logit for SP/RP data.
As the RP data misses all the socio-demographics and has no variation in
the choice variables, I want to do this in two steps: Estimate an SP only model (this works fine), fix everything but the ASCs and the scale parameter and re-estimate with the RP data.
The SP data is panel data, but the RP data has only one observation per participant.
When running, I get the following error. It seems the database is flattened repeatedly until it fails. Can you tell what the problem is?
Thank you very much and best regards,
Matthias
Biogeme parameters read from biogeme.toml.
Flattening database [(14297, 23)].
Database flattened [(11414, 107)]
Flattening database [(14297, 23)].
Database flattened [(11414, 107)]
Flattening database [(14297, 23)].
Database flattened [(11414, 107)]
Flattening database [(14297, 23)].
Database flattened [(11414, 107)]
---------------------------------------------------------------------------
BiogemeError Traceback (most recent call last)
Cell In[5], line 114
111 conditional_trajectory_probability = PanelLikelihoodTrajectory(choice_probability_one_observation)
112 log_probability = log(MonteCarlo(conditional_trajectory_probability))
--> 114 the_biogeme = BIOGEME(
115 database,
116 log_probability,
117 number_of_draws=2000,
118 number_of_threads=1,
119 seed=12345,
120 use_jit=False,
121 )
122 the_biogeme.model_name = 'combined_panel_with_RPscale'
124 # Estimate
File ~/Documents/Biogeme/.venv/lib64/python3.12/site-packages/biogeme/biogeme.py:210, in BIOGEME.__init__(self, database, formulas, random_number_generators, user_notes, parameters, **kwargs)
206 self.null_loglikelihood = None #: Log likelihood of the null model
208 self.best_iteration = None #: Store the best iteration found so far.
--> 210 self._model_elements = ModelElements(
211 expressions=self.formulas,
212 database=self.database,
213 number_of_draws=self.biogeme_parameters.get_value(name='number_of_draws'),
214 user_defined_draws=random_number_generators,
215 use_jit=self.biogeme_parameters.get_value(name='use_jit'),
...
24 logger.warning(warning)
25 if audit_tuple.errors:
---> 26 raise BiogemeError(audit_tuple.errors)
BiogemeError: ['Variable "rp__panel__01__panel__01__panel__01__panel__01" not found in the database.
This is the full code:
# STEP 3 — COMBINED SP+RP (PANEL) WITH RP SCALE
import numpy as np
import pandas as pd
import biogeme.biogeme_logging as blog
from biogeme.biogeme import BIOGEME
from biogeme.database import Database
from biogeme.expressions import Beta, Draws, Variable, MonteCarlo, PanelLikelihoodTrajectory, log
from biogeme.models import logit
from biogeme.results_processing import EstimationResults, get_pandas_estimated_parameters
from IPython.display import display
# -------- 0) Logging / reproducibility (suppress verbose flatten logs) --------
logger = blog.get_screen_logger(level=blog.ERROR)
np.random.seed(seed=12345)
# -------- 1) Load raw SP+RP data --------
df = pd.read_csv('cov_df.csv', sep=';', na_values=['NA', 'NaN', ''])
needed = [
'RID','pref1',
'price_opt1','price_opt2','price_opt3',
'avail_opt1','avail_opt2','avail_opt3',
'opt3',
'ptsub','gender','city_center',
'category_1','category_2','category_3','category_4','category_5','category_6',
'age_cat_18_29','age_cat_30_44','age_cat_45_64','age_cat_65',
'rp',
]
for c in needed:
if c not in df.columns:
df[c] = 0
df['RID'] = pd.to_numeric(df['RID'], errors='coerce').astype(int)
num_cols = [c for c in needed if c != 'RID']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df[['avail_opt1','avail_opt2','avail_opt3']] = df[['avail_opt1','avail_opt2','avail_opt3']].astype(int)
df['pref1'] = df['pref1'].astype(int)
df['rp'] = df['rp'].astype(int)
# -------- 2) Build panel database (RP panels have length 1) --------
database = Database('bike_subscriptions_combined_panel', df)
database.panel('RID')
# -------- 3) Declare variables--------
CHOICE = Variable('pref1')
AV1, AV2, AV3, AV4 = Variable('avail_opt1'), Variable('avail_opt2'), Variable('avail_opt3'), 1
P1 = Variable('price_opt1') / 10.0
P2 = Variable('price_opt2') / 10.0
P3 = Variable('price_opt3') / 10.0
OPT3MIN = Variable('opt3')
def CV(name): # row-level covariates
return Variable(name)
# - drop category_6
# - drop age_cat_65
bin_covars = ['ptsub','gender','city_center']
cat_covars = ['category_1','category_2','category_3','category_4','category_5'] # ref: category_6
age_covars = ['age_cat_18_29','age_cat_30_44','age_cat_45_64'] # ref: age_cat_65
covars = bin_covars + cat_covars + age_covars
# -------- 4) Load Step 1 estimates, set free/fixed status --------
try:
beta_vals = results_sp.get_beta_values() # if Step 1 ran in this kernel
except Exception:
# fallback: load from file saved in Step 1
results_sp = EstimationResults.from_yaml_file(
filename='saved_results/step2_sp_panel_mixedlogit_random_ASCs_refdummies.yaml'
)
beta_vals = results_sp.get_beta_values()
# ASC means
ASC1_mu = Beta('ASC1_mu', beta_vals['ASC1_mu'], None, None, 0)
ASC2_mu = Beta('ASC2_mu', beta_vals['ASC2_mu'], None, None, 0)
ASC3_mu = Beta('ASC3_mu', beta_vals['ASC3_mu'], None, None, 0)
# ASC sigmas (FIXED when adding RP + scale)
ASC1_sig = Beta('ASC1_sig', beta_vals['ASC1_sig'], 0, None, 1)
ASC2_sig = Beta('ASC2_sig', beta_vals['ASC2_sig'], 0, None, 1)
ASC3_sig = Beta('ASC3_sig', beta_vals['ASC3_sig'], 0, None, 1)
# Random ASCs (same draw type as Step 1)
ASC1_rnd = ASC1_mu + ASC1_sig * Draws('ASC1_draw', 'NORMAL_ANTI')
ASC2_rnd = ASC2_mu + ASC2_sig * Draws('ASC2_draw', 'NORMAL_ANTI')
ASC3_rnd = ASC3_mu + ASC3_sig * Draws('ASC3_draw', 'NORMAL_ANTI')
# All other parameters FIXED to Step 1 estimates
B_price = Beta('B_price', beta_vals['B_price'], None, None, 1)
betas_cov = {(cv,k): Beta(f'B_{cv}_opt{k}', beta_vals.get(f'B_{cv}_opt{k}', 0.0), None, None, 1)
for k in [1,2,3] for cv in covars}
B_opt3_minutes = Beta('B_opt3_minutes', beta_vals['B_opt3_minutes'], None, None, 1)
# -------- 5) RP scale (SP scale = 1). --------
lambda_rp = Beta('lambda_rp', 1.0, 1e-6, None, 0)
Scale = 1 + Variable('rp') * (lambda_rp - 1) # 1 for SP rows, λ_RP for RP rows
# -------- 6) Utilities (scaled) --------
V1 = ASC1_rnd + B_price * P1 + sum(betas_cov[(cv,1)] * CV(cv) for cv in covars)
V2 = ASC2_rnd + B_price * P2 + sum(betas_cov[(cv,2)] * CV(cv) for cv in covars)
V3 = ASC3_rnd + B_price * P3 + B_opt3_minutes * OPT3MIN + sum(betas_cov[(cv,3)] * CV(cv) for cv in covars)
V4 = 0
v = {1: Scale * V1, 2: Scale * V2, 3: Scale * V3, 4: Scale * V4}
av = {1: AV1, 2: AV2, 3: AV3, 4: AV4}
# -------- 7) Panel likelihood trajectory + Monte Carlo--------
choice_probability_one_observation = logit(v, av, CHOICE)
conditional_trajectory_probability = PanelLikelihoodTrajectory(choice_probability_one_observation)
log_probability = log(MonteCarlo(conditional_trajectory_probability))
the_biogeme = BIOGEME(
database,
log_probability,
number_of_draws=2000,
number_of_threads=1,
seed=12345,
use_jit=False,
)
the_biogeme.model_name = 'combined_panel_with_RPscale'
# Estimate
try:
results_combined = EstimationResults.from_yaml_file(
filename='combined_panel_with_RPscale.yaml'
)
except FileNotFoundError:
results_combined = the_biogeme.estimate()
print(results_combined.short_summary())
display(get_pandas_estimated_parameters(estimation_results=results_combined))