Implement seeded clustering #1

Open
wants to merge 11 commits into base: main
2 changes: 1 addition & 1 deletion CMVC_main_NYT.py
@@ -2,7 +2,7 @@
import gensim
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
21 changes: 13 additions & 8 deletions CMVC_main_opiec.py
@@ -1,10 +1,11 @@
from helper import *
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

''' *************************************** DATASET PREPROCESSING **************************************** '''

class CMVC_Main(object):
@@ -20,10 +21,10 @@ def read_triples(self):
self.amb_mentions = {} # Contains all ambiguous mentions
self.isAcronym = {} # Contains all mentions which can be acronyms

print('dataset:', args.dataset)
if args.dataset == 'OPIEC59k':
print('dataset:', self.p.dataset)
if self.p.dataset == 'OPIEC59k':
print('load OPIEC_dataset ... ')
self.triples_list = pickle.load(open(args.data_path, 'rb'))
self.triples_list = pickle.load(open(self.p.data_path, 'rb'))

''' Ground truth clustering '''
self.true_ent2clust = ddict(set)
@@ -34,7 +35,7 @@ def read_triples(self):

else:
if not checkFile(fname):
with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f:
with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f:
for line in f:
trp = json.loads(line.strip())

@@ -118,7 +119,7 @@ def embedKG(self):
description='CESI: Canonicalizing Open Knowledge Bases using Embeddings and Side Information')
parser.add_argument('-data', dest='dataset', default='OPIEC59k', help='Dataset to run CESI on')
parser.add_argument('-split', dest='split', default='test', help='Dataset split for evaluation')
parser.add_argument('-data_dir', dest='data_dir', default='../data', help='Data directory')
parser.add_argument('-l', dest='data_dir', default='../data', help='Data directory')
parser.add_argument('-out_dir', dest='out_dir', default='../output', help='Directory to store CESI output')
parser.add_argument('-reset', dest="reset", action='store_true', default=True,
help='Clear the cached files (Start a fresh run)')
@@ -198,7 +199,7 @@ def embedKG(self):
help='Otherwise use subsampling weighting like in word2vec', default=True)

parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float)
parser.add_argument('-cpu', '--cpu_num', default=12, type=int)
parser.add_argument('-cpu', '--cpu_num', default=4, type=int)
parser.add_argument('-init', '--init_checkpoint', default=None, type=str)
parser.add_argument('--warm_up_steps', default=None, type=int)

@@ -210,7 +211,7 @@ def embedKG(self):
parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension')
parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Initialization strategy for k-Means clustering')
parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if simulating user seed feedback')
parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering")
parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk")
parser.add_argument('--unnormalize', action="store_true", help="Whether to skip normalizing each point before clustering")

# word2vec and iteration hyper-parameters
parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True,
@@ -249,4 +254,4 @@ def embedKG(self):

cmvc = CMVC_Main(args) # Loading KG triples
cmvc.get_sideInfo() # Side Information Acquisition
cmvc.embedKG() # Learning embedding for Noun and relation phrases
cmvc.embedKG() # Learning embedding for Noun and relation phrases
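Note on the new clustering flags added above (`--kmeans_initialization`, `--num_cluster_seeds`, `--num_reinit`): this file only declares them. Below is a minimal sketch of how they might be consumed downstream, assuming an `(N, d)` numpy embedding matrix and a list of gold cluster index lists used to simulate user seed feedback; the `build_init_centroids` and `cluster` helpers are illustrative assumptions, not code from this PR.

```python
# Hedged sketch: seeded vs. standard k-means++ initialization for NP embeddings.
import numpy as np
from sklearn.cluster import KMeans

def build_init_centroids(embeddings, gold_clusters, n_clusters, num_seeds, rng):
    """Seed up to num_seeds centroids from gold-cluster means; fill the rest with random points."""
    num_seeds = min(num_seeds, n_clusters, len(gold_clusters))
    chosen = rng.choice(len(gold_clusters), size=num_seeds, replace=False)
    seeded = [embeddings[gold_clusters[c]].mean(axis=0) for c in chosen]
    fill = rng.choice(len(embeddings), size=n_clusters - num_seeds, replace=False)
    return np.stack(seeded + [embeddings[i] for i in fill])

def cluster(embeddings, n_clusters, args, gold_clusters=None, seed=0):
    rng = np.random.default_rng(seed)
    if args.kmeans_initialization == "seeded-k-means++" and gold_clusters:
        # Explicit centroid array: sklearn then requires a single initialization.
        init = build_init_centroids(embeddings, gold_clusters, n_clusters,
                                    args.num_cluster_seeds or n_clusters, rng)
        km = KMeans(n_clusters=n_clusters, init=init, n_init=1)
    else:
        # Plain k-means++ with --num_reinit restarts.
        km = KMeans(n_clusters=n_clusters, init="k-means++", n_init=args.num_reinit)
    return km.fit_predict(embeddings)
```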
28 changes: 16 additions & 12 deletions CMVC_main_reverb45k.py
@@ -2,10 +2,9 @@
import gensim
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
''' *************************************** DATASET PREPROCESSING **************************************** '''


@@ -23,10 +22,10 @@ def read_triples(self):
self.amb_mentions = {} # Contains all ambiguous mentions
self.isAcronym = {} # Contains all mentions which can be acronyms

print('dataset:', args.dataset)
if args.dataset == 'OPIEC':
print('dataset:', self.p.dataset)
if self.p.dataset == 'OPIEC':
print('load OPIEC_dataset ... ')
self.triples_list = pickle.load(open(args.data_path, 'rb'))
self.triples_list = pickle.load(open(self.p.data_path, 'rb'))

''' Ground truth clustering '''
self.true_ent2clust = ddict(set)
@@ -37,7 +36,7 @@ def read_triples(self):

else:
if not checkFile(fname):
with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f:
with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f:
for line in f:
trp = json.loads(line.strip())

@@ -87,17 +86,17 @@ def read_triples(self):
print('self.true_clust2ent:', len(self.true_clust2ent))
print('self.true_ent2clust:', len(self.true_ent2clust))

folder = '../file/' + args.dataset + '/'
folder = '../file/' + self.p.dataset + '/'
if not os.path.exists(folder):
os.makedirs(folder)

fname1, fname2 = '../file/' + args.dataset + '/self.ent2true_link_list', '../file/' + args.dataset + '/self.ent2true_link'
fname1, fname2 = '../file/' + self.p.dataset + '/self.ent2true_link_list', '../file/' + self.p.dataset + '/self.ent2true_link'
if not checkFile(fname1) or not checkFile(fname2):
print('generate ent2true_link_dict')
self.ent2true_link_list = dict()
for trp in self.triples_list:
sub, obj = trp['triple'][0], trp['triple'][2]
if args.dataset == 'OPIEC':
if self.p.dataset == 'OPIEC':
true_sub_link, true_obj_link = trp['subject_wiki_link'], trp['object_wiki_link']
else:
true_sub_link, true_obj_link = trp['true_sub_link'], trp['true_obj_link']
@@ -161,7 +160,7 @@ def embedKG(self):

if not checkFile(fname1) or not checkFile(fname2):
embed = Embeddings(self.p, self.side_info, true_ent2clust=self.true_ent2clust,
true_clust2ent=self.true_clust2ent, triple_list=self.triples_list)
true_clust2ent=self.true_clust2ent, triple_list=self.triples_list, num_reinit=self.p.num_reinit)
embed.fit()

self.ent2embed = embed.ent2embed # Get the learned NP embeddings
@@ -259,18 +258,23 @@ def embedKG(self):
help='Otherwise use subsampling weighting like in word2vec', default=True)

parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float)
parser.add_argument('-cpu', '--cpu_num', default=12, type=int)
parser.add_argument('-cpu', '--cpu_num', default=4, type=int)
parser.add_argument('-init', '--init_checkpoint', default=None, type=str)
parser.add_argument('--warm_up_steps', default=None, type=int)

parser.add_argument('--save_checkpoint_steps', default=10000, type=int)
parser.add_argument('--save_checkpoint_steps', default=1000, type=int)
parser.add_argument('--valid_steps', default=10000, type=int)
parser.add_argument('--log_steps', default=100, type=int, help='train log every xx steps')
parser.add_argument('--test_log_steps', default=1000, type=int, help='valid/test log every xx steps')

parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension')
parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Initialization strategy for k-Means clustering')
parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if simulating user seed feedback')
parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering")
parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk")
parser.add_argument('--unnormalize', action="store_true", help="Whether to skip normalizing each point before clustering")

# word2vec and iteration hyper-parameters
parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True,
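The `Embeddings(...)` call above now receives `num_reinit=self.p.num_reinit`. A minimal sketch of what a reinitialization loop inside the clustering step might look like is below, assuming L2 normalization is applied unless `--unnormalize` is set; `best_of_n_kmeans` is an illustrative helper, not code from this PR.

```python
# Hedged sketch: run k-means num_reinit times and keep the lowest-inertia run.
import numpy as np
from sklearn.cluster import KMeans

def best_of_n_kmeans(points, n_clusters, num_reinit, unnormalize=False):
    if not unnormalize:
        # L2-normalize each point before clustering (skipped with --unnormalize).
        points = points / np.linalg.norm(points, axis=1, keepdims=True)
    best_labels, best_inertia = None, np.inf
    for seed in range(num_reinit):
        km = KMeans(n_clusters=n_clusters, init="k-means++", n_init=1,
                    random_state=seed).fit(points)
        if km.inertia_ < best_inertia:
            best_labels, best_inertia = km.labels_, km.inertia_
    return best_labels
```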
12 changes: 8 additions & 4 deletions Context_view.py
@@ -22,9 +22,12 @@ def __del__(self):
print("BertClassificationModel del ... ")

def forward(self, batch_sentences):
batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
max_length=self.max_length,
pad_to_max_length=True)
try:
batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
max_length=self.max_length,
pad_to_max_length=True)
except:
breakpoint()
input_ids = torch.tensor(batch_tokenized['input_ids']).cuda()
attention_mask = torch.tensor(batch_tokenized['attention_mask']).cuda()
bert_output = self.bert(input_ids, attention_mask=attention_mask)
@@ -44,7 +47,7 @@ def __init__(self, params, side_info, input_list, cluster_predict_list, true_ent
self.BERT_self_training_time = BERT_self_training_time
self.sub_uni2triple_dict = sub_uni2triple_dict
self.rel_id2sentence_list = rel_id2sentence_list
self.batch_size = 40
self.batch_size = 30
if self.p.dataset == 'reverb45k_change':
self.epochs = 100
else:
@@ -156,6 +159,7 @@ def fine_tune(self):
if i == (batch_count - 1):
real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S")
print(real_time, "Epoch: %d, Loss: %.4f" % (epoch, avg_epoch_loss))
breakpoint()

self.BERT_CLS = cls_output.detach().cpu().numpy()
pickle.dump(self.BERT_CLS, open(fname1, 'wb'))
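The `try/except` added around `batch_encode_plus` above only drops into `breakpoint()` when tokenization fails. Below is a hedged alternative sketch that validates the batch and uses the `padding`/`truncation` keywords in place of `pad_to_max_length`; the `tokenize_batch` helper is illustrative, not code from this PR.

```python
# Hedged sketch, assuming a HuggingFace tokenizer like the BERT tokenizer used above.
import torch

def tokenize_batch(tokenizer, batch_sentences, max_length):
    # Coerce any non-string entries so the tokenizer cannot fail on them.
    batch_sentences = [s if isinstance(s, str) else "" for s in batch_sentences]
    enc = tokenizer(batch_sentences, add_special_tokens=True, max_length=max_length,
                    padding="max_length", truncation=True, return_tensors="pt")
    return enc["input_ids"].cuda(), enc["attention_mask"].cuda()
```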