Implement seeded clustering #1

Open
wants to merge 11 commits into base: main
2 changes: 1 addition & 1 deletion CMVC_main_NYT.py
@@ -2,7 +2,7 @@
import gensim
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
21 changes: 13 additions & 8 deletions CMVC_main_opiec.py
@@ -1,10 +1,11 @@
from helper import *
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

''' *************************************** DATASET PREPROCESSING **************************************** '''

class CMVC_Main(object):
@@ -20,10 +21,10 @@ def read_triples(self):
self.amb_mentions = {} # Contains all ambiguous mentions
self.isAcronym = {} # Contains all mentions which can be acronyms

print('dataset:', args.dataset)
if args.dataset == 'OPIEC59k':
print('dataset:', self.p.dataset)
if self.p.dataset == 'OPIEC59k':
print('load OPIEC_dataset ... ')
self.triples_list = pickle.load(open(args.data_path, 'rb'))
self.triples_list = pickle.load(open(self.p.data_path, 'rb'))

''' Ground truth clustering '''
self.true_ent2clust = ddict(set)
@@ -34,7 +35,7 @@ def read_triples(self):

else:
if not checkFile(fname):
with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f:
with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f:
for line in f:
trp = json.loads(line.strip())

@@ -118,7 +119,7 @@ def embedKG(self):
description='CESI: Canonicalizing Open Knowledge Bases using Embeddings and Side Information')
parser.add_argument('-data', dest='dataset', default='OPIEC59k', help='Dataset to run CESI on')
parser.add_argument('-split', dest='split', default='test', help='Dataset split for evaluation')
parser.add_argument('-data_dir', dest='data_dir', default='../data', help='Data directory')
parser.add_argument('-l', dest='data_dir', default='../data', help='Data directory')
parser.add_argument('-out_dir', dest='out_dir', default='../output', help='Directory to store CESI output')
parser.add_argument('-reset', dest="reset", action='store_true', default=True,
help='Clear the cached files (Start a fresh run)')
@@ -198,7 +199,7 @@ def embedKG(self):
help='Otherwise use subsampling weighting like in word2vec', default=True)

parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float)
parser.add_argument('-cpu', '--cpu_num', default=12, type=int)
parser.add_argument('-cpu', '--cpu_num', default=4, type=int)
parser.add_argument('-init', '--init_checkpoint', default=None, type=str)
parser.add_argument('--warm_up_steps', default=None, type=int)

@@ -210,7 +211,7 @@ def embedKG(self):
parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension')
parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Initialization strategy for k-Means clustering')
parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if simulating user seed feedback')
parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering")
parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk")
parser.add_argument('--unnormalize', action="store_true", help="Whether to skip normalizing each point before clustering")

# word2vec and iteration hyper-parameters
parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True,
@@ -249,4 +254,4 @@ def embedKG(self):

cmvc = CMVC_Main(args) # Loading KG triples
cmvc.get_sideInfo() # Side Information Acquisition
cmvc.embedKG() # Learning embedding for Noun and relation phrases
cmvc.embedKG() # Learning embedding for Noun and relation phrases
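Note on the new clustering flags added above (`--kmeans_initialization`, `--num_cluster_seeds`, `--num_reinit`): this file only declares them. Below is a minimal sketch of how they might be consumed downstream, assuming an `(N, d)` numpy embedding matrix and a list of gold cluster index lists used to simulate user seed feedback; the `build_init_centroids` and `cluster` helpers are illustrative assumptions, not code from this PR.

```python
# Hedged sketch: seeded vs. standard k-means++ initialization for NP embeddings.
import numpy as np
from sklearn.cluster import KMeans

def build_init_centroids(embeddings, gold_clusters, n_clusters, num_seeds, rng):
    """Seed up to num_seeds centroids from gold-cluster means; fill the rest with random points."""
    num_seeds = min(num_seeds, n_clusters, len(gold_clusters))
    chosen = rng.choice(len(gold_clusters), size=num_seeds, replace=False)
    seeded = [embeddings[gold_clusters[c]].mean(axis=0) for c in chosen]
    fill = rng.choice(len(embeddings), size=n_clusters - num_seeds, replace=False)
    return np.stack(seeded + [embeddings[i] for i in fill])

def cluster(embeddings, n_clusters, args, gold_clusters=None, seed=0):
    rng = np.random.default_rng(seed)
    if args.kmeans_initialization == "seeded-k-means++" and gold_clusters:
        # Explicit centroid array: sklearn then requires a single initialization.
        init = build_init_centroids(embeddings, gold_clusters, n_clusters,
                                    args.num_cluster_seeds or n_clusters, rng)
        km = KMeans(n_clusters=n_clusters, init=init, n_init=1)
    else:
        # Plain k-means++ with --num_reinit restarts.
        km = KMeans(n_clusters=n_clusters, init="k-means++", n_init=args.num_reinit)
    return km.fit_predict(embeddings)
```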
28 changes: 16 additions & 12 deletions CMVC_main_reverb45k.py
@@ -2,10 +2,9 @@
import gensim
from preprocessing import SideInfo # For processing data and side information
from embeddings_multi_view import Embeddings
from utils import *
from cmvc_utils import *
import os, argparse, pickle, codecs
from collections import defaultdict as ddict
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
''' *************************************** DATASET PREPROCESSING **************************************** '''


@@ -23,10 +22,10 @@ def read_triples(self):
self.amb_mentions = {} # Contains all ambiguous mentions
self.isAcronym = {} # Contains all mentions which can be acronyms

print('dataset:', args.dataset)
if args.dataset == 'OPIEC':
print('dataset:', self.p.dataset)
if self.p.dataset == 'OPIEC':
print('load OPIEC_dataset ... ')
self.triples_list = pickle.load(open(args.data_path, 'rb'))
self.triples_list = pickle.load(open(self.p.data_path, 'rb'))

''' Ground truth clustering '''
self.true_ent2clust = ddict(set)
@@ -37,7 +36,7 @@ def read_triples(self):

else:
if not checkFile(fname):
with codecs.open(args.data_path, encoding='utf-8', errors='ignore') as f:
with codecs.open(self.p.data_path, encoding='utf-8', errors='ignore') as f:
for line in f:
trp = json.loads(line.strip())

@@ -87,17 +86,17 @@ def read_triples(self):
print('self.true_clust2ent:', len(self.true_clust2ent))
print('self.true_ent2clust:', len(self.true_ent2clust))

folder = '../file/' + args.dataset + '/'
folder = '../file/' + self.p.dataset + '/'
if not os.path.exists(folder):
os.makedirs(folder)

fname1, fname2 = '../file/' + args.dataset + '/self.ent2true_link_list', '../file/' + args.dataset + '/self.ent2true_link'
fname1, fname2 = '../file/' + self.p.dataset + '/self.ent2true_link_list', '../file/' + self.p.dataset + '/self.ent2true_link'
if not checkFile(fname1) or not checkFile(fname2):
print('generate ent2true_link_dict')
self.ent2true_link_list = dict()
for trp in self.triples_list:
sub, obj = trp['triple'][0], trp['triple'][2]
if args.dataset == 'OPIEC':
if self.p.dataset == 'OPIEC':
true_sub_link, true_obj_link = trp['subject_wiki_link'], trp['object_wiki_link']
else:
true_sub_link, true_obj_link = trp['true_sub_link'], trp['true_obj_link']
@@ -161,7 +160,7 @@ def embedKG(self):

if not checkFile(fname1) or not checkFile(fname2):
embed = Embeddings(self.p, self.side_info, true_ent2clust=self.true_ent2clust,
true_clust2ent=self.true_clust2ent, triple_list=self.triples_list)
true_clust2ent=self.true_clust2ent, triple_list=self.triples_list, num_reinit=self.p.num_reinit)
embed.fit()

self.ent2embed = embed.ent2embed # Get the learned NP embeddings
@@ -259,18 +258,23 @@ def embedKG(self):
help='Otherwise use subsampling weighting like in word2vec', default=True)

parser.add_argument('-lr', '--learning_rate', default=0.0001, type=float)
parser.add_argument('-cpu', '--cpu_num', default=12, type=int)
parser.add_argument('-cpu', '--cpu_num', default=4, type=int)
parser.add_argument('-init', '--init_checkpoint', default=None, type=str)
parser.add_argument('--warm_up_steps', default=None, type=int)

parser.add_argument('--save_checkpoint_steps', default=10000, type=int)
parser.add_argument('--save_checkpoint_steps', default=1000, type=int)
parser.add_argument('--valid_steps', default=10000, type=int)
parser.add_argument('--log_steps', default=100, type=int, help='train log every xx steps')
parser.add_argument('--test_log_steps', default=1000, type=int, help='valid/test log every xx steps')

parser.add_argument('--nentity', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('--nrelation', type=int, default=0, help='DO NOT MANUALLY SET')
parser.add_argument('-embed_dims', dest='embed_dims', default=300, type=int, help='Embedding dimension')
parser.add_argument('--kmeans_initialization', default="k-means++", type=str, choices=["k-means++", "seeded-k-means++", "pc"], help='Initialization strategy for k-Means clustering')
parser.add_argument('--num_cluster_seeds', default=None, type=int, help='Number of cluster seeds to use, if simulating user seed feedback')
parser.add_argument('--num_reinit', type=int, default=10, help="Number of reinitializations to try for k-Means clustering")
parser.add_argument('--save_model', action="store_true", help="Whether or not to serialize and save model outputs and parameters to disk")
parser.add_argument('--unnormalize', action="store_true", help="Whether to skip normalizing each point before clustering")

# word2vec and iteration hyper-parameters
parser.add_argument('-retrain_literal_embeds', dest='retrain_literal_embeds', default=True,
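The `Embeddings(...)` call above now receives `num_reinit=self.p.num_reinit`. A minimal sketch of what a reinitialization loop inside the clustering step might look like is below, assuming L2 normalization is applied unless `--unnormalize` is set; `best_of_n_kmeans` is an illustrative helper, not code from this PR.

```python
# Hedged sketch: run k-means num_reinit times and keep the lowest-inertia run.
import numpy as np
from sklearn.cluster import KMeans

def best_of_n_kmeans(points, n_clusters, num_reinit, unnormalize=False):
    if not unnormalize:
        # L2-normalize each point before clustering (skipped with --unnormalize).
        points = points / np.linalg.norm(points, axis=1, keepdims=True)
    best_labels, best_inertia = None, np.inf
    for seed in range(num_reinit):
        km = KMeans(n_clusters=n_clusters, init="k-means++", n_init=1,
                    random_state=seed).fit(points)
        if km.inertia_ < best_inertia:
            best_labels, best_inertia = km.labels_, km.inertia_
    return best_labels
```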
12 changes: 8 additions & 4 deletions Context_view.py
@@ -22,9 +22,12 @@ def __del__(self):
print("BertClassificationModel del ... ")

def forward(self, batch_sentences):
batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
max_length=self.max_length,
pad_to_max_length=True)
try:
batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
max_length=self.max_length,
pad_to_max_length=True)
except:
breakpoint()
input_ids = torch.tensor(batch_tokenized['input_ids']).cuda()
attention_mask = torch.tensor(batch_tokenized['attention_mask']).cuda()
bert_output = self.bert(input_ids, attention_mask=attention_mask)
@@ -44,7 +47,7 @@ def __init__(self, params, side_info, input_list, cluster_predict_list, true_ent
self.BERT_self_training_time = BERT_self_training_time
self.sub_uni2triple_dict = sub_uni2triple_dict
self.rel_id2sentence_list = rel_id2sentence_list
self.batch_size = 40
self.batch_size = 30
if self.p.dataset == 'reverb45k_change':
self.epochs = 100
else:
@@ -156,6 +159,7 @@ def fine_tune(self):
if i == (batch_count - 1):
real_time = time.strftime("%Y_%m_%d") + ' ' + time.strftime("%H:%M:%S")
print(real_time, "Epoch: %d, Loss: %.4f" % (epoch, avg_epoch_loss))
breakpoint()

self.BERT_CLS = cls_output.detach().cpu().numpy()
pickle.dump(self.BERT_CLS, open(fname1, 'wb'))
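The `try/except` added around `batch_encode_plus` above only drops into `breakpoint()` when tokenization fails. Below is a hedged alternative sketch that validates the batch and uses the `padding`/`truncation` keywords in place of `pad_to_max_length`; the `tokenize_batch` helper is illustrative, not code from this PR.

```python
# Hedged sketch, assuming a HuggingFace tokenizer like the BERT tokenizer used above.
import torch

def tokenize_batch(tokenizer, batch_sentences, max_length):
    # Coerce any non-string entries so the tokenizer cannot fail on them.
    batch_sentences = [s if isinstance(s, str) else "" for s in batch_sentences]
    enc = tokenizer(batch_sentences, add_special_tokens=True, max_length=max_length,
                    padding="max_length", truncation=True, return_tensors="pt")
    return enc["input_ids"].cuda(), enc["attention_mask"].cuda()
```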