IBM
diff --git a/‎README.md
100644100755
+103-2 b/‎README.md
100644100755
+103-2
diff --git a/‎WordVectors.py
+203 b/‎WordVectors.py
+203
diff --git a/‎alignment.py
+47 b/‎alignment.py
+47
@@ -1,2 +1,103 @@
-# S4_semantic_shift
-code for reproducing results in AAAI 2021 paper
+# Fake it Till You Make it: Self-Supervised Semantic Shifts for Monolingual Word Embedding Tasks
+
+This code repository contains the code for the experiments seen in the paper  `Fake it Till You Make it: Self-Supervised Semantic Shifts for Monolingual Word Embedding Tasks` (2020).
+
+## Requirements
+
+This repository consists mainly of Python 3 routines; their dependencies are listed in `requirements.txt`. To install the dependencies using pip/venv, run:
+
+```
+pip3 install -r requirements.txt
+```
+
+## Setup
+
+After installing the requirements, run `setup.sh` to configure the environment and download the pre-trained word embeddings.
+```
+sh setup.sh
+```
+This will create the folders to store the results, and will download pre-trained vectors. The size of the download is approximately 500MB.
+
+Alternatively, the pre-trained embeddings can be downloaded [here](https://zenodo.org/record/3890109/files/wordvectors.zip?download=1).
+
+
+## Results
+
+
+### British English :gb: vs. American English :us:
+
+Results for the classification task on detecting semantic shift between British English and American English.
+
+**Requires the pre-trained word embeddings from BNC and COCA**
+
+To reproduce these results, run:
+
+```
+chmod +x ukus_experiment.sh
+./ukus_experiment.sh
+```
+
+By default, results are saved to `results/ukus/cls_results.txt`.
+
+|Method|Alignment|Accuracy|Precision|Recall|F1|
+|------|---------|--------|---------|------|--|
+|COS|global|0.35|0.71|0.19|0.3|
+|S4-D|global|0.45 +- 0.02|0.45 +- 0.02|0.45 +- 0.02|0.45 +- 0.03|
+|Noisy-Pairs|-|0.29|1.0|0.03|0.06|
+
+
+
+### SemEval-2020 Task on Unsupervised Lexical Semantic Change Detection
+
+Results for the binary classification task on semantic shift for multiple languages (SemEval2020 Task 1): English, German, Latin, and Swedish.
+
+**Requires the pre-trained embeddings from SemEval**
+
+To reproduce these results:
+
+```
+chmod +x semeval_experiment.sh
+./semeval_experiment.sh
+```
+
+By default, results are saved to `results/semeval/cls_results.txt`.
+
+|Method|Language|Mean acc.|Max acc.|
+|------|--------|---------|--------|
+| s4|english|0.62|0.7|
+| noise-aware|english|0.61|0.65|
+| top-10|english|0.59|0.68|
+| bot-10|english|0.58|0.68|
+| global|english|0.61|0.68|
+| top-5|english|0.59|0.65|
+| bot-5|english|0.57|0.68|
+
+
+
+### ArXiv Semantic Shift Discovery
+
+Word discovery experiment on the arXiv data set for subjects Artificial Intelligence (cs.AI) and Classical Physics (physics.class-ph). This table shows the list of top semantically shifted words uniquely discovered by Global, Noise-Aware and S4-A alignments, respectively, as well as the most shifted words commonly discovered by all three methods.
+
+**Requires the pre-trained embeddings from arXiv**
+
+To reproduce these results:
+
+```
+chmod +x arxiv_experiment.sh
+./arxiv_experiment.sh
+```
+
+The table of results is saved in `results/arxiv/table.txt`; the ranking correlation plot is saved in `results/arxiv/arxiv_ranking.pdf`.
+
+|Global|Noise-Aware|S4-A|Common| |
+|------|-----------|----|------|-|
+|agent||components|concepts|nodes|
+|approximation||element|density|phys|
+|boundary||mass|deterministic|polynomial|
+|conceptual||order|die|probability|
+|knowledge||solution|edge|respect|
+|plane||space|equations|rev|
+|reference||state|fields|rough|
+|rules||term|internal|rule|
+|system||time|light|tensor|
+|systems||vector|los|variables|
@@ -0,0 +1,203 @@
+import numpy as np
+from collections import OrderedDict
+from sklearn import preprocessing
+
+
+# This file contains the WordVectors class used to load and handle word embeddings
def intersection(*args):
    """
    Restrict several WordVectors objects to the vocabulary they all share.

    Arguments:
        *args   - two or more WordVectors objects
    Returns:
        list(WordVectors), one per input and in the same order as the inputs.
        Each holds only the words present in every input, ordered as they
        appear in the first input. If fewer than two objects are given, an
        error message is printed and None is returned.
    """
    if len(args) < 2:
        print("! Error: intersection requires at least 2 WordVector objects")
        return None

    # A set gives fast membership tests but no stable order, so it is only
    # used as a filter; the output order follows the first WordVectors to
    # keep results consistent between runs.
    vocabularies = [set(wv.words) for wv in args]
    shared = set.intersection(*vocabularies)
    ordered = [word for word in args[0].words if word in shared]

    # Build one WordVectors per input from that input's own vectors
    return [
        WordVectors(words=ordered, vectors=[source[word] for word in ordered])
        for source in args
    ]
+
+
def union(*args, f="average"):
    """
    Performs union of one or more word vectors, returning a new WordVectors
    containing the union of words and a combination of vectors according to
    the given function.
    Arguments:
        *args   - list of WordVectors objects
        f       - function used to combine the vectors of a word. Either the
                  string "average" (default) or a callable that receives a
                  (k, dim) array holding the k vectors found for the word and
                  returns a single vector of size dim.
    Returns:
        wv      - WordVectors as the union of the input args. Words are
                  ordered by first appearance (all words of the first input,
                  then unseen words of the second, and so on).
    Raises:
        ValueError - if no WordVectors is given or f is not supported
    """
    if not args:
        raise ValueError("union requires at least 1 WordVectors object")

    if callable(f):
        combine = f
    elif f == "average":
        def combine(vecs):
            return sum(vecs) / len(vecs)
    else:
        raise ValueError("unsupported combination function: %r" % (f,))

    # Iterating a set would make the word order depend on string hashing and
    # change between runs; keep first-seen order so results are reproducible.
    words = list(OrderedDict.fromkeys(w for wv in args for w in wv.words))

    vectors = np.zeros((len(words), args[0].dimension), dtype=float)
    for i, w in enumerate(words):
        # Collect the vector of w from every input that contains it
        vecs = np.array([wv[w] for wv in args if w in wv])
        vectors[i] = combine(vecs)

    return WordVectors(words=words, vectors=vectors)
+
+
# Implements a WordVectors class that maps word tokens to vectors.
# Words are stored both as a list (id -> word) and as a dict (word -> id).
class WordVectors:
    """
    WordVectors class containing methods for handling the mapping of words
    to vectors.
    Attributes
    - word_id -- OrderedDict mapping word to id in list of vectors
    - words -- list of words mapping id (index) to word string
    - vectors -- n x dim matrix of word vectors, follows id order
    - counts -- optional word counts, indexed by word id (see get_count)
    - dimension -- dimension of wordvectors
    Constructor arguments
    - words, vectors -- parallel sequences used to construct the object
    - zipped -- iterable of (word, vec) pairs used to construct the object
                when words/vectors are not given
    - input_file -- path of a text file to load (see read_file)
    - centered -- subtract the mean vector from every vector (default True)
    - normalized -- scale every vector to unit L2 norm (default False)
    - min_freq, word_frequency -- when word_frequency (dict word -> count)
                is given, keep only words whose count is strictly greater
                than min_freq
    """
    def __init__(self, words=None, vectors=None, counts=None, zipped=None,
                 input_file=None, centered=True, normalized=False,
                 min_freq=0, word_frequency=None):

        if (words is None or vectors is None) and zipped:
            # Materialize first: zipped may be a one-shot iterator
            pairs = list(zipped)
            words = [w for w, _ in pairs]
            vectors = [v for _, v in pairs]

        if words is not None and vectors is not None:
            self.words = list(words)
            self.word_id = OrderedDict()
            for i, w in enumerate(self.words):
                self.word_id[w] = i
            self.vectors = np.array(vectors)
            self.counts = counts
            self.dimension = len(vectors[0]) if len(vectors) else 0
        elif input_file:
            self.dimension = 0
            self.word_id = OrderedDict()
            self.words = list()
            self.counts = dict()
            self.vectors = None
            self.read_file(input_file)
        else:
            # Nothing to load: start as an empty container so that every
            # attribute exists.
            self.dimension = 0
            self.word_id = OrderedDict()
            self.words = list()
            self.counts = counts
            self.vectors = None

        # Pre-processing is meaningless (and would fail) without vectors
        has_vectors = self.vectors is not None and len(self.vectors) > 0
        if has_vectors and centered:
            self.center()
        if has_vectors and normalized:
            self.normalize()

        if has_vectors and word_frequency:
            self.filter_frequency(min_freq, word_frequency)

    def center(self):
        """Subtract the mean vector (over all words) from every vector."""
        self.vectors = self.vectors - self.vectors.mean(axis=0, keepdims=True)

    def normalize(self):
        """Scale every vector to unit L2 norm (sklearn preprocessing)."""
        self.vectors = preprocessing.normalize(self.vectors, norm="l2")

    def get_words(self):
        """Return a view over the words, in id order."""
        return self.word_id.keys()

    # Returns a numpy (m, dim) array for a given list of words
    # I.e.: select vectors whose word are in argument words
    def get_vectors_from_words(self, words):
        vectors = np.zeros((len(words), self.dimension))
        for i, w in enumerate(words):
            vectors[i] = self[w]
        return vectors

    # Return vec (or (word, vec) if return_word is set) for given word
    # Raises KeyError if the word is not in the vocabulary
    def loc(self, word, return_word=False):
        vec = self.vectors[self.word_id[word]]
        if return_word:
            return word, vec
        return vec

    # Return the count stored for a word; counts is indexed by word id
    def get_count(self, word):
        if self.counts is None:
            raise ValueError("no counts were provided for this WordVectors")
        return self.counts[self.word_id[word]]

    # Get vector (or (word, vector) pair if return_word is set) from id
    def iloc(self, id_query, return_word=False):
        if return_word:
            return self.words[id_query], self.vectors[id_query]
        return self.vectors[id_query]

    # Overload []:
    #   integer (Python or any numpy integer) -> vector with that id
    #   slice -> (list of words, list of vectors), honoring the slice step
    #   anything else -> treated as a word, returns its vector
    def __getitem__(self, key):
        if isinstance(key, (int, np.integer)):
            return self.iloc(key)
        if isinstance(key, slice):
            return (list(self.words[key]), list(self.vectors[key]))
        return self.loc(key)

    def __len__(self):
        return len(self.words)

    def __contains__(self, word):
        return word in self.word_id

    def filter_frequency(self, min_freq, word_frequency):
        """
        Keep only the words found in word_frequency whose count is strictly
        greater than min_freq, then rebuild words, vectors and word_id.
        """
        print("Filtering %d" % min_freq)
        words_kept = list()
        vectors_kept = list()
        for word, vec in zip(self.words, self.vectors):
            if word in word_frequency and word_frequency[word] > min_freq:
                words_kept.append(word)
                vectors_kept.append(vec)

        self.words = words_kept
        self.vectors = np.array(vectors_kept)
        self.word_id = OrderedDict()
        for i, w in enumerate(self.words):
            self.word_id[w] = i

        print(" - Found %d words" % len(self.words))

    # Read file in following format:
    # n_items dim
    # word v_1 v_2 ... v_dim      (one line per word, space separated)
    def read_file(self, path):
        words = list()
        rows = list()
        # Embedding files are read as UTF-8 so that non-ASCII words load the
        # same way on every platform.
        with open(path, encoding="utf-8") as fin:
            # The declared number of items is not needed: rows are counted
            _, dim = map(int, fin.readline().split())
            for line in fin:
                line = line.rstrip()
                if not line:  # tolerate blank lines, e.g. at end of file
                    continue
                word, values = line.split(" ", 1)
                words.append(word)
                rows.append(np.array(values.split(), dtype=float))

        self.dimension = dim
        self.words = words
        self.word_id = OrderedDict((w, i) for i, w in enumerate(words))
        if rows:
            self.vectors = np.array(rows, dtype=float)
        else:
            # Header-only file: keep a well-formed (0, dim) matrix
            self.vectors = np.zeros((0, dim), dtype=float)

    # Write the vectors in the text format accepted by read_file
    def save_txt(self, path):
        with open(path, "w", encoding="utf-8") as fout:
            # The header counts the rows actually written below
            fout.write("%d %d\n" % (len(self.words), self.dimension))
            for word, vec in zip(self.words, self.vectors):
                v_string = " ".join(map(str, vec))
                fout.write("%s %s\n" % (word, v_string))
@@ -0,0 +1,47 @@
+from scipy.linalg import orthogonal_procrustes
+import numpy as np
+from WordVectors import WordVectors
+
+# Word alignment module
def align(wv1, wv2, anchor_indices=None, anchor_words=None, anchor_top=None,
           anchor_bot=None, anchor_random=None,
           exclude=None,
           method="procrustes"):
    """
    Implement OP alignment for a given set of landmarks.
    If no landmark is given, performs global alignment.
    Index-based landmarks (anchor_top, anchor_bot, anchor_random,
    anchor_indices) pair row i of wv1 with row i of wv2, so both inputs are
    expected to list the same words in the same order.
    Only the first landmark option that is not None is used, checked in this
    order: anchor_top, anchor_bot, anchor_random, anchor_indices,
    anchor_words.
    Arguments:
        wv1 - WordVectors object to align to wv2
        wv2 - Target WordVectors. Will align wv1 to it.
        anchor_indices - (optional) uses word indices as landmarks
        anchor_words - (optional) uses words as landmarks
        anchor_top - (optional) uses the first anchor_top words as landmarks
        anchor_bot - (optional) uses the last anchor_bot words as landmarks
        anchor_random - (optional) number of landmark indices drawn with
                        np.random.choice (default sampling, with replacement)
        exclude - set of words to exclude from alignment
        method - Alignment objective. Currently only supports orthogonal procrustes.
    Returns:
        (wv1_, wv2, Q) - a new WordVectors with the vectors of wv1 multiplied
        by Q (built with the WordVectors constructor defaults), the unchanged
        wv2, and the orthogonal matrix Q.
    Raises:
        ValueError - if method is not supported or no landmark is left
    """
    if method != "procrustes":
        raise ValueError("unsupported alignment method: %r" % (method,))

    excluded = set(exclude) if exclude else set()

    # Resolve index-based landmark options into one list of row indices
    if anchor_top is not None:
        indices = range(anchor_top)
    elif anchor_bot is not None:
        # Negative indices -anchor_bot..-1 select the last rows
        indices = range(-anchor_bot, 0)
    elif anchor_random is not None:
        indices = np.random.choice(range(len(wv1.vectors)), anchor_random)
    elif anchor_indices is not None:
        indices = anchor_indices
    else:
        indices = None

    if indices is not None:
        # Filter once so that both sides keep exactly the same rows
        kept = [i for i in indices
                if wv1.words[i] not in excluded
                and wv2.words[i] not in excluded]
        v1 = [wv1.vectors[i] for i in kept]
        v2 = [wv2.vectors[i] for i in kept]
    elif anchor_words is not None:
        # Materialize so anchor_words may be any iterable, even one-shot
        kept = [w for w in anchor_words if w not in excluded]
        v1 = [wv1[w] for w in kept]
        v2 = [wv2[w] for w in kept]
    else:  # just use all words
        v1 = [wv1[w] for w in wv1.words if w not in excluded]
        v2 = [wv2[w] for w in wv2.words if w not in excluded]

    if len(v1) == 0 or len(v2) == 0:
        raise ValueError("no landmark left to align on")

    v1 = np.array(v1)
    v2 = np.array(v2)
    # Q is the orthogonal matrix that best maps v1 onto v2
    Q, _ = orthogonal_procrustes(v1, v2)

    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    return wv1_, wv2, Q