implementation of an indexer #27

Open · wants to merge 4 commits into master
6 changes: 6 additions & 0 deletions .gitignore
@@ -51,6 +51,12 @@ coverage.xml
.hypothesis/
.pytest_cache/

# Local test resources and generated data
/geesedb/tests/resources/WP
/geesedb/tests/test_script.py
/geesedb/tests/generated_dbs
/geesedb/resources/nltk_data

# Translations
*.mo
*.pot
28 changes: 28 additions & 0 deletions README.md
@@ -30,6 +30,7 @@ pytest
```

## How do I index?
### CSV files
The fastest way to load text data into GeeseDB is through CSV files. Three CSV files are needed: one for terms, one for documents, and one that connects terms to documents. Small examples of these files can be found in the repository: [docs.csv](./geesedb/tests/resources/csv/example_docs.csv), [term_dict.csv](./geesedb/tests/resources/csv/example_term_dict.csv), and [term_doc.csv](./geesedb/tests/resources/csv/example_term_doc.csv).

These files can be generated from [CIFF](https://github.com/osirrc/ciff) collections using the [to_csv](./geesedb/utils/ciff/to_csv.py) class, or you can create them however you like. The documents can be loaded using the following code:
@@ -46,6 +47,33 @@ index = FullTextFromCSV(
index.load_data()
```

### JSONL file
Another way to load data into GeeseDB is through a JSONL file. To index a collection of documents:
```python
from geesedb.index import Indexer

indexer = Indexer(
database='/path/to/database',
file='/path/to/file'
)
indexer.run()
```
If nothing else is specified, the category of each key is inferred automatically and used to create the tables.
The categories can also be set manually by passing the ``schema_dict`` parameter when defining the ``Indexer``; it maps each key to one of four categories:
`<id>` (only one key can be the ID), `<indexable>` for text used in the retrieval phase, `<metadata>` for keys that can have a separate table, or `<other>` if the data in that key is not relevant for retrieval.
Right now the values in the JSONL file should be numbers or strings; other types are not supported yet.
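
For example, the categories could be set manually with a ``schema_dict`` like the sketch below (the key names are only illustrative; use the keys that actually appear in your JSONL file):
```python
from geesedb.index import Indexer

# Illustrative key names; replace them with the keys in your JSONL file.
schema_dict = {
    'id': '<id>',               # unique document identifier (only one key may be the ID)
    'title': '<indexable>',     # text used in the retrieval phase
    'body': '<indexable>',
    'author': '<metadata>',     # stored in a separate table
    'internal_flag': '<other>'  # not relevant for retrieval
}

indexer = Indexer(
    database='/path/to/database',
    file='/path/to/file',
    schema_dict=schema_dict
)
indexer.run()
```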

A few more parameters can be set. ``tokenization_method`` can be ``syntok``, ``nltk``, or any other user-specified function. The stop word set ``stop_words`` can be initialized with ``nltk``, ``lucene``, or ``None``; in the last case only the characters in ``delete_chars`` are included in the stop word list. The stemmer options are ``porter`` and ``snowball``; for ``snowball``, the language can be set with ``language`` to any of the languages nltk supports (a combined sketch follows the language list below).

From [nltk documentation](https://www.nltk.org/api/nltk.stem.snowball.html):
```python
>>> from nltk.stem import SnowballStemmer
>>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
arabic danish dutch english finnish french german hungarian
italian norwegian porter portuguese romanian russian
spanish swedish
```
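
Putting these options together, an indexing call might look like the sketch below. The ``stemmer`` parameter name is an assumption (the text above only names the stemmer options, not the parameter), and the paths and chosen values are placeholders:
```python
from geesedb.index import Indexer

# Sketch only: the stemmer parameter name is an assumption, paths are placeholders.
indexer = Indexer(
    database='/path/to/database',
    file='/path/to/file',
    tokenization_method='syntok',  # or 'nltk', or any custom tokenization function
    stop_words='nltk',             # or 'lucene', or None (then only delete_chars is used)
    stemmer='snowball',            # or 'porter'; assumed parameter name
    language='dutch'               # any language supported by nltk's SnowballStemmer
)
indexer.run()
```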

## How do I search?
After indexing the data, it is easy to construct a first-stage ranking using BM25:

6 changes: 5 additions & 1 deletion geesedb/index/__init__.py
@@ -2,5 +2,9 @@
from .entities_from_csv import EntitiesFromCSV
from .fulltext_from_ciff import FullTextFromCiff
from .fulltext_from_csv import FullTextFromCSV
from .indexer.indexer import Indexer
from .indexer.terms_processor import TermsProcessor
from .indexer.doc_readers import read_from_WaPo_json

__all__ = ['FullTextFromCSV', 'AuthorsFromCSV', 'FullTextFromCiff', 'EntitiesFromCSV']
__all__ = ['FullTextFromCSV', 'AuthorsFromCSV', 'FullTextFromCiff', 'EntitiesFromCSV', 'Indexer', 'TermsProcessor',
'read_from_WaPo_json']
Empty file.
187 changes: 187 additions & 0 deletions geesedb/index/automatic_schema/schema.py
@@ -0,0 +1,187 @@
from geesedb.resources.schema.names import names
from ..utils import _get_all_values_from_json_key

import time
import re
import random
import numpy as np
from typing import Dict, List, Tuple
from dateutil.parser import parse as parse_time, ParserError
from duckdb import DuckDBPyConnection


# ----------------------------------------------------------------------------------------------------------------------
# Infer variable functions
# ----------------------------------------------------------------------------------------------------------------------

def infer_time(date_val, unix_now) -> bool:
    try:
        parse_time(str(date_val))
        return True
    except (OverflowError, ParserError, ValueError):
        pass
    try:
        date_val = int(date_val)
    except (ValueError, TypeError):
        return False
    # Unix timestamps in seconds, milliseconds, microseconds or nanoseconds
    if len(str(date_val)) in {10, 13, 16, 19}:
        return 0 < date_val < int(str(unix_now)[:len(str(date_val))])
    return False


def infer_name(name_val, tokenizer_function, threshold=0.5) -> bool:
    # name list from https://github.com/dangayle/first-name-gender/blob/master/names/names.py
    if not isinstance(name_val, str):
        return False
    tokens = tokenizer_function(name_val)
    count = 0
    token_len = 0
    for word in tokens:
        token_len += 1
        if word.value.lower() in names:
            count += 1
    return token_len != 0 and count / token_len >= threshold


def infer_ids(ids_list, std_threshold=1, list_rate=0.3) -> int:
"""
0: not an id
1: might be str. id
2: might be increasing int id
3: might be another kind of id, but less likely
"""
# check if all values are unique
if len(ids_list) != len(set(ids_list)):
return 0

# check the length distribution (better if there's little deviation)
lengths = []
int_type_count = 0
ids_random_vals = random.sample(ids_list, int(list_rate * len(ids_list)))
for val in ids_random_vals:
        lengths.append(len(str(val[0])))  # str() so integer ids also have a length
if isinstance(val[0], int):
int_type_count += 1
if int_type_count == 0 and np.std(
lengths) <= std_threshold: # look at distribution if it's str
return 1
elif int_type_count == len(ids_random_vals) and np.max(ids_list) - np.min(ids_list) == len(ids_list) - 1:
return 2
else:
return 3


def infer_url(url_val) -> Tuple[bool, bool]:  # (is_url, is_string)
if isinstance(url_val, str):
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([" \
r"^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
url = re.findall(regex, url_val)
if len(url) != 0:
return True, True
else:
return False, True
else:
return False, False


# ----------------------------------------------------------------------------------------------------------------------
# Assess if value is important for the schema creation, an ID, or used for search
# ----------------------------------------------------------------------------------------------------------------------

def get_true_percentage(dict_list: List, upper_threshold_list=None, lower_threshold_list=None) -> bool:
if upper_threshold_list is None:
upper_threshold_list = [0.8, 0.8, 0.65]
if lower_threshold_list is None:
lower_threshold_list = [0.05, 0.05, 0.05]
assert len(dict_list) == len(upper_threshold_list) == len(lower_threshold_list)
count = 0
for d, t_up, t_low in zip(dict_list, upper_threshold_list, lower_threshold_list):
perc = d[True] / (d[True] + d[False])
if perc >= t_up:
return True
elif perc < t_low: # if all have really low percentages, return True to break loop
count += 1
if count == len(dict_list):
return True
return False


def infer_variable(connection: DuckDBPyConnection, dict_keys: List, tokenizer_function, schema_dict=None,
stop_step: int = 1000, max_step: int = 10000) -> Dict:
"""
Calls all above functions for every variable.

decisions for schema_dict include: <id>, <indexable>, <metadata>, <other>
"""
unix_now = int(time.time_ns())
decisions = {}
id_present = False

for key in dict_keys:
if schema_dict is not None and key in schema_dict:
if schema_dict[key] == '<id>':
if id_present:
raise Exception("An ID has already been specified or inferred!")
decisions[key] = schema_dict[key]
id_present = True
elif schema_dict[key] in ['<indexable>', '<metadata>', '<other>']:
decisions[key] = schema_dict[key]
else:
decisions[key] = '<other>'
continue

val_list = _get_all_values_from_json_key(connection, key)
url_dict = {True: 0, False: 0}
time_dict = {True: 0, False: 0}
names_dict = {True: 0, False: 0}
is_string = {True: 0, False: 0}
for i, val in enumerate(val_list):
val = val[0][1:-1] # delete quotes from reading from table
is_url, is_string_bool = infer_url(val)
url_dict[is_url] += 1
is_string[is_string_bool] += 1
time_dict[infer_time(val, unix_now)] += 1
names_dict[infer_name(val, tokenizer_function)] += 1
if i != 0 and i % stop_step == 0 and get_true_percentage([url_dict, time_dict, names_dict]):
break
if i == max_step:
break
url_dec = url_dict[True] / (url_dict[True] + url_dict[False])
if url_dec >= 0.8:
decisions[key] = '<url>'
continue

time_dec = time_dict[True] / (time_dict[True] + time_dict[False])
if time_dec >= 0.8:
decisions[key] = '<other>'
continue

names_dec = names_dict[True] / (names_dict[True] + names_dict[False])
if names_dec >= 0.65:
decisions[key] = '<metadata>'
continue

ids_int = infer_ids(val_list)
if ids_int in [1, 2] and not id_present:
id_present = True
decisions[key] = '<doc_id>'
continue
elif ids_int == 3:
decisions[key] = '<other>'
continue

is_string_dec = is_string[True] / (is_string[True] + is_string[False])
        if is_string_dec >= 0.95:  # not 100% to allow for occasional parsing errors
decisions[key] = '<indexable>'
else:
decisions[key] = '<other>'

print('Final variable decisions:')
print(decisions)
return decisions
Empty file.
55 changes: 55 additions & 0 deletions geesedb/index/indexer/doc_readers.py
@@ -0,0 +1,55 @@
from resiliparse.parse.html import HTMLTree
from resiliparse.extract.html2text import extract_plain_text

from typing import Dict
import pandas as pd

doc_general = {'collection_id': None,
'doc_id': None,
'title': '',
'date': '',
'authors': '',
'content': '',
'others': {}}


def read_from_WaPo_json(line: Dict, doc_id: int, include_links: bool) -> Dict:
    """
    Read one document from the WaPo JSONL file and convert it to the general format.
    """
    if line.get('id') is None:
        raise IOError('Missing collection ID')
    d: Dict = {'collection_id': line['id'], 'doc_id': doc_id, 'title': line['title'],
               'date': line['published_date'], 'authors': line['author']}
# 'authors': [author for author in line['author'].split(',')]
text = ''
if 'contents' in line:
for dict_ in line['contents']: # types: kicker, title, image (fullcaption), byline, sanitized_html
if (dict_ is not None) and ('content' in dict_) and ('type' in dict_):
            if dict_['type'] in ('kicker', 'title') or \
                    (dict_['type'] == 'sanitized_html' and dict_.get('mime') == 'text/plain'):
                text += str(dict_['content']) + ' '
            elif dict_['type'] == 'sanitized_html' and dict_.get('mime') == 'text/html':
                text += str(extract_plain_text(HTMLTree.parse(dict_['content']), links=include_links,
                                               list_bullets=False)) + ' '
d['content'] = text
elif 'content' in line:
d['content'] = line['content']
else:
        raise IOError('No contents found in the .jl file.')

# for any other key not mentioned before, its content goes to others
# for k in line.keys():
# if k not in d.keys():
# d['others'] = line[k]

return d


def read_raw_json(file_path: str, n_rows: int = 20000) -> Dict:
return pd.read_json(file_path, lines=True, nrows=n_rows).to_dict(orient='list')


def read_raw_CSV(file_path: str, sep: str = ',', n_rows: int = 20000) -> Dict:
    # use sep='\t' for a tab-separated file
    return pd.read_csv(file_path, sep=sep, nrows=n_rows).to_dict(orient='list')