From c719e4575907ada2da675924fd4d5fa0ee33f83b Mon Sep 17 00:00:00 2001
From: vbhavank <bhavan.vasu@kitware.com>
Date: Fri, 14 Aug 2020 12:20:21 -0400
Subject: [PATCH] Fix relevancy cache and normalize

---
 python/smqtk/algorithms/nn_index/faiss.py     | 31 ++++++++++++-------
 .../relevancy_index/logistic_reg.py           | 18 +----------
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/python/smqtk/algorithms/nn_index/faiss.py b/python/smqtk/algorithms/nn_index/faiss.py
index 9a6e9e7d..3f7b52aa 100644
--- a/python/smqtk/algorithms/nn_index/faiss.py
+++ b/python/smqtk/algorithms/nn_index/faiss.py
@@ -29,8 +29,10 @@
 # Requires FAISS bindings
 try:
     import faiss
+    import sklearn.preprocessing
 except ImportError:
     faiss = None
+    sklearn = None
 
 
 class FaissNearestNeighborsIndex (NearestNeighborsIndex):
@@ -50,10 +52,18 @@ def gpu_supported():
         else:
             return False
 
+    @staticmethod
+    def normalize_vec(data, min_range=0, max_range=1):
+        """Min-max scale each row of ``data`` into [min_range, max_range]."""
+        return sklearn.preprocessing.minmax_scale(data, feature_range=(min_range, max_range), axis=1, copy=False)
+       
     @classmethod
     def is_usable(cls):
         # if underlying library is not found, the import above will error
-        return faiss is not None
+        return (
+            faiss is not None
+            and sklearn is not None
+        )
 
     @classmethod
     def get_default_config(cls):
@@ -183,6 +193,10 @@ def __init__(self, descriptor_set, idx2uid_kvs, uid2idx_kvs,
             existing index. False by default.
         :type read_only: bool
 
+        :param distance_m: Key for selecting the metric used during indexing
+            and retrieval. 'cosine' and 'euclidean' are currently supported.
+        :type distance_m: str
+
         :param factory_string: String to pass to FAISS' `index_factory`;
             see the documentation [1] on this feature for more details.
         :type factory_string: str | unicode
@@ -436,11 +450,9 @@ def _build_index(self, descriptors):
 
         faiss_index = self._index_factory_wrapper(d, self.factory_string)
         # noinspection PyArgumentList
-        if self._distance_metric:
-            data = (
-                data / np.linalg.norm(
-                data, axis=1, keepdims=True)
-            )
+        if self._distance_metric == 'cosine':
+            # NOTE(review): L2 ~ cosine only if rows come out unit-norm.
+            data = self.normalize_vec(data)
         faiss_index.train(data)
         # TODO(john.moeller): This will raise an exception on flat indexes.
         # There's a solution which involves wrapping the index in an
@@ -651,11 +663,8 @@ def _nn(self, d, n=1):
 
         """
         q = d.vector()[np.newaxis, :].astype(np.float32)
-        if self._distance_metric:
-            q = (
-                q / np.linalg.norm(
-                q, axis=1, keepdims=True)
-            )
+        if self._distance_metric == 'cosine':
+            q = self.normalize_vec(q)
         self._log.debug("Received query for %d nearest neighbors", n)
 
         with self._model_lock:
diff --git a/python/smqtk/algorithms/relevancy_index/logistic_reg.py b/python/smqtk/algorithms/relevancy_index/logistic_reg.py
index 068d332a..682ce81d 100644
--- a/python/smqtk/algorithms/relevancy_index/logistic_reg.py
+++ b/python/smqtk/algorithms/relevancy_index/logistic_reg.py
@@ -23,9 +23,6 @@ class LogisticRegRelevancyIndex (RelevancyIndex):
     to implement IQR ranking.
     """
 
-    # Dictionary of parameter/value pairs that will be passed to libSVM during
-    # the model trail phase. Parameters that are flags, i.e. have no values,
-    # should be given an empty string ('') value.
     LR_TRAIN_PARAMS = {
         "penalty": "l2",
         "dual": True,  
@@ -47,16 +44,13 @@ def is_usable(cls):
         """
         return LogisticRegression and sklearn
 
-    def __init__(self, descr_cache_filepath=None, autoneg_select_ratio=1,
+    def __init__(self, autoneg_select_ratio=1,
                  multiprocess_fetch=False, cores=None):
         """
         Initialize a new or existing index.
         TODO ::
         - input optional known background descriptors, i.e. descriptors for
             things that would otherwise always be considered a negative example.
-        :param descr_cache_filepath: Optional path to store/load descriptors
-            we index.
-        :type descr_cache_filepath: None | str
         :param autoneg_select_ratio: Number of maximally distant descriptors to
             select from our descriptor cache for each positive example provided
             when no negative examples are provided for ranking.
@@ -73,7 +67,6 @@ def __init__(self, descr_cache_filepath=None, autoneg_select_ratio=1,
         """
         super(LogisticRegRelevancyIndex, self).__init__()
 
-        self.descr_cache_fp = descr_cache_filepath
         self.autoneg_select_ratio = int(autoneg_select_ratio)
         self.multiprocess_fetch = multiprocess_fetch
         self.cores = cores
@@ -87,20 +80,12 @@ def __init__(self, descr_cache_filepath=None, autoneg_select_ratio=1,
         # subsequently in the distance kernel
         self._descr2index = {}
 
-        if self.descr_cache_fp and osp.exists(self.descr_cache_fp):
-            with open(self.descr_cache_fp, 'rb') as f:
-                descriptors = pickle.load(f)
-                self.descr_cache_fp = None
-                self.build_index(descriptors)
-                self.descr_cache_fp = descr_cache_filepath
-
     @classmethod
     def _gen_lr_parameter_string(cls):
         return cls.LR_TRAIN_PARAMS
     
     def get_config(self):
         return {
-            "descr_cache_filepath": self.descr_cache_fp,
             'autoneg_select_ratio': self.autoneg_select_ratio,
             'multiprocess_fetch': self.multiprocess_fetch,
             'cores': self.cores,
@@ -149,7 +134,6 @@ def get_vector(d_elem):
             self._descr2index[tuple(v)] = i
         self._descr_matrix = numpy.array(self._descr_matrix)
         
-
     def rank(self, pos, neg):
         """
         Rank the currently indexed elements given ``pos`` positive and ``neg``