Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[0.2.dev3] Choice simulation without capacity constraints #43

Merged
merged 15 commits into from
Oct 3, 2018
2 changes: 1 addition & 1 deletion choicemodels/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .choicemodels import *
from .mnl import MultinomialLogit, MultinomialLogitResults

version = __version__ = '0.2.dev2'
version = __version__ = '0.2.dev3'
26 changes: 13 additions & 13 deletions choicemodels/mnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class MultinomialLogit(object):
Parameters
----------

data : pandas.DataFrame or choicemodels.tools.MergedChoiceTable
data : pd.DataFrame or choicemodels.tools.MergedChoiceTable
A table of estimation data in "long" format, with one row for each combination of
chooser and alternative. Column labeling must be consistent with the
'model_expression'. May include extra columns.
Expand Down Expand Up @@ -120,7 +120,7 @@ class MultinomialLogit(object):
the model expression varies for different alternatives. Not required if data is
passed as a MergedChoiceTable.

initial_coefs : float, list, or 1D array, optional
initial_coefs : numeric or list-like of numerics, optional
Initial coefficients (beta values) to begin the optimization process with. Provide
a single value for all coefficients, or an array containing a value for each
one being estimated. If None, initial coefficients will be 0.
Expand Down Expand Up @@ -222,8 +222,8 @@ def fit(self):
model_type = 'MNL')

m.fit_mle(init_vals = self._initial_coefs)
results = MultinomialLogitResults(self._estimation_engine,
self._model_expression,
results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
model_expression = self._model_expression,
results = m)

elif (self._estimation_engine == 'ChoiceModels'):
Expand All @@ -241,8 +241,8 @@ def fit(self):
fit_parameters = fit,
x_names = model_design.design_info.column_names)

results = MultinomialLogitResults(self._estimation_engine,
self._model_expression,
results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
model_expression = self._model_expression,
results = result_params)

return results
Expand All @@ -268,9 +268,6 @@ class MultinomialLogitResults(object):

Parameters
----------
estimation_engine : str
'ChoiceModels' or 'PyLogit'. # TO DO - infer from model_expression?

model_expression : str or OrderedDict
Patsy 'formula-like' (str) or PyLogit 'specification' (OrderedDict).

Expand All @@ -281,9 +278,12 @@ class MultinomialLogitResults(object):
fitted_parameters : list of floats, optional
If not provided, these will be extracted from the raw results.

estimation_engine : str, optional
'ChoiceModels' (default) or 'PyLogit'. # TO DO - infer from model_expression?

"""
def __init__(self, estimation_engine, model_expression, results=None,
fitted_parameters=None):
def __init__(self, model_expression, results=None, fitted_parameters=None,
estimation_engine='ChoiceModels'):

if (fitted_parameters is None) & (results is not None):
if (estimation_engine == 'ChoiceModels'):
Expand Down Expand Up @@ -336,7 +336,7 @@ def probabilities(self, data):
df = data.to_frame()
numalts = data.sample_size # TO DO - make this an official MCT param

dm = dmatrix(self.model_expression, data=df, return_type='dataframe')
dm = dmatrix(self.model_expression, data=df)

# utility is sum of data values times fitted betas
u = np.dot(self.fitted_parameters, np.transpose(dm))
Expand All @@ -354,7 +354,7 @@ def probabilities(self, data):
probs = exponentiated_utility / sum_exponentiated_utility

# convert back to ordering of the input data
probs = np.reshape(np.transpose(probs), (probs.size, 1))
probs = probs.flatten(order='F')

df['prob'] = probs # adds indexes
return df.prob
Expand Down
1 change: 1 addition & 0 deletions choicemodels/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
# See full license in LICENSE

from .mergedchoicetable import *
from .simulation import *
19 changes: 14 additions & 5 deletions choicemodels/tools/mergedchoicetable.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,27 @@
sampling functionality.

"""
import random
import sys

import numpy as np
import pandas as pd


class MergedChoiceTable(object):
"""
Generates a merged long-format table of choosers and alternatives, for discrete choice
model estimation or simulation.
Generates a merged long-format table of observations (choosers) and alternatives, for
discrete choice model estimation or simulation.

Supports random sampling of alternatives (uniform or weighted). Supports sampling with
or without replacement. Supports merging observations and alternatives without
sampling them. Supports alternative-specific weights, as well as interaction weights
that depend on both the observation and alternative. Supports automatic merging of
interaction terms onto the final data table.

Support is PLANNED for specifying availability of alternatives, specifying random
state, and passing interaction-type parameters as callable generator functions.

Does NOT support cases where the number of alternatives in the final table varies
across observations.

Reserved column names: 'chosen'.

Parameters
Expand Down
78 changes: 78 additions & 0 deletions choicemodels/tools/simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Utilities for Monte Carlo simulation of choices.

"""
import numpy as np
import pandas as pd


def monte_carlo_choices(probabilities):
    """
    Monte Carlo simulation of choices for a set of K scenarios, each having different
    probability distributions (and potentially different alternatives).

    Choices are independent and unconstrained, meaning that the same alternative can be
    chosen in multiple scenarios.

    This function is equivalent to applying np.random.choice() to each of the K scenarios,
    but it's implemented as a single-pass matrix calculation. This is about 50x faster
    than using df.apply() or a loop.

    If all the choice scenarios have the same probability distribution among alternatives,
    you don't need this function. You can use np.random.choice() with size=K, which will
    be more efficient. (For example, that would work for a choice model whose expression
    includes only attributes of the alternatives.)

    NOTE ABOUT THE INPUT FORMATS: It's important for the probabilities to be structured
    correctly. This is computationally expensive to verify, so you will not get a warning
    if it's wrong! (TO DO: we should provide an option to perform these checks, though)

    1. Probabilities (pd.Series) must include a two-level MultiIndex, the first level
       representing the scenario (observation) id and the second the alternative id.

    2. Probabilities must be sorted so that each scenario's alternatives are consecutive.

    3. Each scenario must have the same number of alternatives. You can pad a scenario
       with zero-probability alternatives if needed.

    4. Each scenario's alternative probabilities must sum to 1.

    Parameters
    ----------
    probabilities: pd.Series or single-column pd.DataFrame
        List of probabilities for each observation (choice scenario) and alternative.
        A single-column DataFrame is silently converted to a Series. Please verify
        that the formatting matches the four requirements described above.

    Returns
    -------
    pd.Series
        List of chosen alternative id's, indexed with the observation id.

    Raises
    ------
    ValueError
        If a DataFrame with more than one column is passed.

    """
    # Accept a single-column DataFrame by converting it to a Series; anything wider
    # is ambiguous, so raise rather than guess which column holds the probabilities
    if isinstance(probabilities, pd.DataFrame):
        if probabilities.shape[1] != 1:
            raise ValueError("Expected a pd.Series or single-column pd.DataFrame, "
                             "but got a DataFrame with {} columns"
                             .format(probabilities.shape[1]))
        probabilities = probabilities.iloc[:, 0]

    obs_name, alts_name = probabilities.index.names

    obs = probabilities.index.get_level_values(0)
    alts = probabilities.index.get_level_values(1)

    num_obs = obs.unique().size
    num_alts = probabilities.size // num_obs

    # This Monte Carlo approach is adapted from urbansim.urbanchoice.mnl_simulate()
    probs = np.array(probabilities)
    cumprobs = probs.reshape((num_obs, num_alts)).cumsum(axis=1)

    # Simulate choice by subtracting a random float (one draw per scenario) from the
    # scenario's cumulative probabilities
    scaledprobs = np.subtract(cumprobs, np.random.rand(num_obs, 1))

    # Replace negative values with 0 and positive values with 1, then use argmax to
    # return the position of the first positive value -- the chosen alternative
    choice_ix = np.argmax((scaledprobs + 1.0).astype('i4'), axis=1)
    choice_ix_1d = choice_ix + (np.arange(num_obs) * num_alts)

    choices = pd.DataFrame({obs_name: obs.values.take(choice_ix_1d),
                            alts_name: alts.values.take(choice_ix_1d)})

    return choices.set_index(obs_name)[alts_name]

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name='choicemodels',
version='0.2.dev2',
version='0.2.dev3',
description='Tools for discrete choice estimation',
long_description=long_description,
author='UDST',
Expand Down
2 changes: 1 addition & 1 deletion tests/test_mnl_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_prediction():
numalts=mct.sample_size, returnprobs=True)

df = mct.to_frame()
df['prob'] = np.reshape(probs, (probs.size, 1))
df['prob'] = probs.flatten()
probs2 = df.prob

pd.testing.assert_series_equal(probs1, probs2)
67 changes: 67 additions & 0 deletions tests/test_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Tests for the simulation codebase.

"""
from __future__ import division

import numpy as np
import pandas as pd
import pytest

import choicemodels


# TO DO - could we set a random seed and then verify that monte_carlo_choices() provides
# the same output as np.random.choice()?

def build_data(num_obs, num_alts):
    """
    Build a synthetic set of choice scenarios: each of `num_obs` observations gets
    `num_alts` randomly-drawn alternative ids with probabilities normalized to sum
    to 1 within the scenario. Returns a pd.Series indexed by ('oid', 'aid').

    """
    scenario_ids = np.repeat(np.arange(num_obs), num_alts)
    alt_ids = np.random.randint(0, num_alts * 10, size=num_obs * num_alts)

    # Normalize random weights so each scenario's probabilities sum to 1
    raw_weights = np.random.rand(num_alts, num_obs)
    normalized = raw_weights / raw_weights.sum(axis=0)

    table = pd.DataFrame({'oid': scenario_ids,
                          'aid': alt_ids,
                          'probs': normalized.flatten(order='F')})
    return table.set_index(['oid', 'aid']).probs


def test_unconstrained_simulation():
    """
    Smoke test: run an unconstrained choice simulation on a fairly large synthetic
    dataset and confirm it completes without errors.

    """
    probabilities = build_data(1000, 100)
    choicemodels.tools.monte_carlo_choices(probabilities)


def test_simulation_accuracy():
    """
    Check that monte_carlo_choices() generates choices whose empirical frequency
    matches the provided probability, for one arbitrarily chosen
    (observation, alternative) pair.

    """
    data = build_data(5, 3)

    # Pick an arbitrary row; derive the range from the data itself rather than
    # hard-coding 5 * 3 = 15
    r = np.random.randint(0, len(data))
    oid, aid = data.index[r]

    # Sum in case the same alternative id was randomly drawn more than once for
    # this scenario (build_data samples alternative ids with replacement)
    mask = ((data.index.get_level_values('oid') == oid) &
            (data.index.get_level_values('aid') == aid))
    prob = data[mask].sum()

    n = 1000
    count = 0
    for i in range(n):
        choices = choicemodels.tools.monte_carlo_choices(data)
        if (choices.loc[oid] == aid):
            count += 1

    # Empirical frequency should be within 0.1 of the true probability
    assert(count/n > prob-0.1)
    assert(count/n < prob+0.1)