Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[0.2.dev3] Choice simulation without capacity constraints #43

Merged
merged 15 commits into from
Oct 3, 2018
2 changes: 1 addition & 1 deletion choicemodels/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .choicemodels import *
from .mnl import MultinomialLogit, MultinomialLogitResults

version = __version__ = '0.2.dev2'
version = __version__ = '0.2.dev3'
26 changes: 13 additions & 13 deletions choicemodels/mnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class MultinomialLogit(object):
Parameters
----------

data : pandas.DataFrame or choicemodels.tools.MergedChoiceTable
data : pd.DataFrame or choicemodels.tools.MergedChoiceTable
A table of estimation data in "long" format, with one row for each combination of
chooser and alternative. Column labeling must be consistent with the
'model_expression'. May include extra columns.
Expand Down Expand Up @@ -120,7 +120,7 @@ class MultinomialLogit(object):
the model expression varies for different alternatives. Not required if data is
passed as a MergedChoiceTable.

initial_coefs : float, list, or 1D array, optional
initial_coefs : numeric or list-like of numerics, optional
Initial coefficients (beta values) to begin the optimization process with. Provide
a single value for all coefficients, or an array containing a value for each
one being estimated. If None, initial coefficients will be 0.
Expand Down Expand Up @@ -222,8 +222,8 @@ def fit(self):
model_type = 'MNL')

m.fit_mle(init_vals = self._initial_coefs)
results = MultinomialLogitResults(self._estimation_engine,
self._model_expression,
results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
model_expression = self._model_expression,
results = m)

elif (self._estimation_engine == 'ChoiceModels'):
Expand All @@ -241,8 +241,8 @@ def fit(self):
fit_parameters = fit,
x_names = model_design.design_info.column_names)

results = MultinomialLogitResults(self._estimation_engine,
self._model_expression,
results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
model_expression = self._model_expression,
results = result_params)

return results
Expand All @@ -268,9 +268,6 @@ class MultinomialLogitResults(object):

Parameters
----------
estimation_engine : str
'ChoiceModels' or 'PyLogit'. # TO DO - infer from model_expression?

model_expression : str or OrderedDict
Patsy 'formula-like' (str) or PyLogit 'specification' (OrderedDict).

Expand All @@ -281,9 +278,12 @@ class MultinomialLogitResults(object):
fitted_parameters : list of floats, optional
If not provided, these will be extracted from the raw results.

estimation_engine : str, optional
'ChoiceModels' (default) or 'PyLogit'. # TO DO - infer from model_expression?

"""
def __init__(self, estimation_engine, model_expression, results=None,
fitted_parameters=None):
def __init__(self, model_expression, results=None, fitted_parameters=None,
estimation_engine='ChoiceModels'):

if (fitted_parameters is None) & (results is not None):
if (estimation_engine == 'ChoiceModels'):
Expand Down Expand Up @@ -336,7 +336,7 @@ def probabilities(self, data):
df = data.to_frame()
numalts = data.sample_size # TO DO - make this an official MCT param

dm = dmatrix(self.model_expression, data=df, return_type='dataframe')
dm = dmatrix(self.model_expression, data=df)

# utility is sum of data values times fitted betas
u = np.dot(self.fitted_parameters, np.transpose(dm))
Expand All @@ -354,7 +354,7 @@ def probabilities(self, data):
probs = exponentiated_utility / sum_exponentiated_utility

# convert back to ordering of the input data
probs = np.reshape(np.transpose(probs), (probs.size, 1))
probs = probs.flatten(order='F')

df['prob'] = probs # adds indexes
return df.prob
Expand Down
1 change: 1 addition & 0 deletions choicemodels/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
# See full license in LICENSE

from .mergedchoicetable import *
from .simulation import *
19 changes: 14 additions & 5 deletions choicemodels/tools/mergedchoicetable.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,27 @@
sampling functionality.

"""
import random
import sys

import numpy as np
import pandas as pd


class MergedChoiceTable(object):
"""
Generates a merged long-format table of choosers and alternatives, for discrete choice
model estimation or simulation.
Generates a merged long-format table of observations (choosers) and alternatives, for
discrete choice model estimation or simulation.

Supports random sampling of alternatives (uniform or weighted). Supports sampling with
or without replacement. Supports merging observations and alternatives without
sampling them. Supports alternative-specific weights, as well as interaction weights
that depend on both the observation and alternative. Supports automatic merging of
interaction terms onto the final data table.

Support is PLANNED for specifying availability of alternatives, specifying random
state, and passing interaction-type parameters as callable generator functions.

Does NOT support cases where the number of alternatives in the final table varies
across observations.

Reserved column names: 'chosen'.

Parameters
Expand Down
78 changes: 78 additions & 0 deletions choicemodels/tools/simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Utilities for Monte Carlo simulation of choices.

"""
import numpy as np
import pandas as pd


def monte_carlo_choices(probabilities):
    """
    Monte Carlo simulation of choices for a set of K scenarios, each having different
    probability distributions (and potentially different alternatives).

    Choices are independent and unconstrained, meaning that the same alternative can be
    chosen in multiple scenarios.

    This function is equivalent to applying np.random.choice() to each of the K scenarios,
    but it's implemented as a single-pass matrix calculation. This is about 50x faster
    than using df.apply() or a loop.

    If all the choice scenarios have the same probability distribution among alternatives,
    you don't need this function. You can use np.random.choice() with size=K, which will
    be more efficient. (For example, that would work for a choice model whose expression
    includes only attributes of the alternatives.)

    NOTE ABOUT THE INPUT FORMATS: It's important for the probabilities to be structured
    correctly. This is computationally expensive to verify, so you will not get a warning
    if it's wrong! (TO DO: we should provide an option to perform these checks, though)

    1. Probabilities (pd.Series) must include a two-level MultiIndex, the first level
       representing the scenario (observation) id and the second the alternative id.

    2. Probabilities must be sorted so that each scenario's alternatives are consecutive.

    3. Each scenario must have the same number of alternatives. You can pad a scenario
       with zero-probability alternatives if needed.

    4. Each scenario's alternative probabilities must sum to 1.

    Parameters
    ----------
    probabilities: pd.Series or single-column pd.DataFrame
        List of probabilities for each observation (choice scenario) and alternative.
        A single-column DataFrame is silently converted to a Series. Please verify
        that the formatting matches the four requirements described above.

    Returns
    -------
    pd.Series
        List of chosen alternative id's, indexed with the observation id.

    Raises
    ------
    ValueError
        If a DataFrame with more than one column is passed.

    """
    # Accept a single-column DataFrame by converting it to a Series; anything wider
    # is ambiguous, so raise rather than guess which column holds the probabilities
    if isinstance(probabilities, pd.DataFrame):
        if probabilities.shape[1] != 1:
            raise ValueError("Expected a pd.Series or single-column pd.DataFrame, "
                             "but got a DataFrame with {} columns"
                             .format(probabilities.shape[1]))
        probabilities = probabilities.iloc[:, 0]

    obs_name, alts_name = probabilities.index.names

    obs = probabilities.index.get_level_values(0)
    alts = probabilities.index.get_level_values(1)

    num_obs = obs.unique().size
    num_alts = probabilities.size // num_obs

    # This Monte Carlo approach is adapted from urbansim.urbanchoice.mnl_simulate()
    probs = np.array(probabilities)
    cumprobs = probs.reshape((num_obs, num_alts)).cumsum(axis=1)

    # Simulate choice by subtracting a random float (one draw per scenario) from the
    # scenario's cumulative probabilities
    scaledprobs = np.subtract(cumprobs, np.random.rand(num_obs, 1))

    # Replace negative values with 0 and positive values with 1, then use argmax to
    # return the position of the first positive value -- the chosen alternative
    choice_ix = np.argmax((scaledprobs + 1.0).astype('i4'), axis=1)
    choice_ix_1d = choice_ix + (np.arange(num_obs) * num_alts)

    choices = pd.DataFrame({obs_name: obs.values.take(choice_ix_1d),
                            alts_name: alts.values.take(choice_ix_1d)})

    return choices.set_index(obs_name)[alts_name]

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name='choicemodels',
version='0.2.dev2',
version='0.2.dev3',
description='Tools for discrete choice estimation',
long_description=long_description,
author='UDST',
Expand Down
2 changes: 1 addition & 1 deletion tests/test_mnl_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_prediction():
numalts=mct.sample_size, returnprobs=True)

df = mct.to_frame()
df['prob'] = np.reshape(probs, (probs.size, 1))
df['prob'] = probs.flatten()
probs2 = df.prob

pd.testing.assert_series_equal(probs1, probs2)
67 changes: 67 additions & 0 deletions tests/test_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Tests for the simulation codebase.

"""
from __future__ import division

import numpy as np
import pandas as pd
import pytest

import choicemodels


# TO DO - could we set a random seed and then verify that monte_carlo_choices() provides
# the same output as np.random.choice()?

def build_data(num_obs, num_alts):
    """
    Build a synthetic set of choice scenarios: each of `num_obs` observations gets
    `num_alts` randomly-drawn alternative ids with probabilities normalized to sum
    to 1 within the scenario. Returns a pd.Series indexed by ('oid', 'aid').

    """
    scenario_ids = np.repeat(np.arange(num_obs), num_alts)
    alt_ids = np.random.randint(0, num_alts * 10, size=num_obs * num_alts)

    # Normalize random weights so each scenario's probabilities sum to 1
    raw_weights = np.random.rand(num_alts, num_obs)
    normalized = raw_weights / raw_weights.sum(axis=0)

    table = pd.DataFrame({'oid': scenario_ids,
                          'aid': alt_ids,
                          'probs': normalized.flatten(order='F')})
    return table.set_index(['oid', 'aid']).probs


def test_unconstrained_simulation():
    """
    Smoke test: run an unconstrained choice simulation on a fairly large synthetic
    dataset and confirm it completes without errors.

    """
    probabilities = build_data(1000, 100)
    choicemodels.tools.monte_carlo_choices(probabilities)


def test_simulation_accuracy():
    """
    Check that monte_carlo_choices() generates choices whose empirical frequency
    matches the provided probability, for one arbitrarily chosen
    (observation, alternative) pair.

    """
    data = build_data(5, 3)

    # Pick an arbitrary row; derive the range from the data itself rather than
    # hard-coding 5 * 3 = 15
    r = np.random.randint(0, len(data))
    oid, aid = data.index[r]

    # Sum in case the same alternative id was randomly drawn more than once for
    # this scenario (build_data samples alternative ids with replacement)
    mask = ((data.index.get_level_values('oid') == oid) &
            (data.index.get_level_values('aid') == aid))
    prob = data[mask].sum()

    n = 1000
    count = 0
    for i in range(n):
        choices = choicemodels.tools.monte_carlo_choices(data)
        if (choices.loc[oid] == aid):
            count += 1

    # Empirical frequency should be within 0.1 of the true probability
    assert(count/n > prob-0.1)
    assert(count/n < prob+0.1)