Add PINA example folder #225

Merged · 1 commit · Jun 1, 2023
65 changes: 65 additions & 0 deletions examples/pina/DataPrep_forXCrepo.py
@@ -0,0 +1,65 @@
import argparse
import os

import numpy as np
import scipy.sparse as smat
import sklearn.preprocessing  # `import sklearn` alone does not expose sklearn.preprocessing
from xclib.data import data_utils

def main():
    parser = argparse.ArgumentParser(description='DataPrep_forXCrepo')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-Amazon-131K')
    args = parser.parse_args()
    print(args)

    cur_dir = f'{args.work_dir}/dataset/{args.dataset}'
    # Ensure the output sub-directories exist before saving.
    os.makedirs(f'{cur_dir}/normalized', exist_ok=True)
    os.makedirs(f'{cur_dir}/raw', exist_ok=True)

    if args.dataset in ['LF-Amazon-131K', 'LF-WikiSeeAlso-320K', 'LF-Amazon-1.3M']:
        # Read files with features and labels (old sparse format from the Extreme Classification Repository)
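        # Each such file starts with a header line "num_samples num_features num_labels",
        # followed by one line per sample: comma-separated label ids, a space, then
        # space-separated "feature_id:value" pairs.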
        features, labels, num_samples, num_features, num_labels = data_utils.read_data(f'{cur_dir}/train.txt')
        features = features.astype(np.float32)
        sklearn.preprocessing.normalize(features, copy=False)  # row-wise L2 normalization, in place
        smat.save_npz(f'{cur_dir}/X_bow.trn.npz', features)
        smat.save_npz(f'{cur_dir}/normalized/Y.trn.npz', labels)
        smat.save_npz(f'{cur_dir}/raw/Y.trn.npz', labels)

        features, labels, num_samples, num_features, num_labels = data_utils.read_data(f'{cur_dir}/test.txt')
        features = features.astype(np.float32)
        sklearn.preprocessing.normalize(features, copy=False)
        smat.save_npz(f'{cur_dir}/X_bow.tst.npz', features)
        smat.save_npz(f'{cur_dir}/normalized/Y.tst.npz', labels)
        smat.save_npz(f'{cur_dir}/raw/Y.tst.npz', labels)

        L_bow = data_utils.read_sparse_file(f'{cur_dir}/Yf.txt', header=True)
        sklearn.preprocessing.normalize(L_bow, copy=False)
        smat.save_npz(f"{cur_dir}/Y_bow.npz", L_bow)
    elif args.dataset in ['LF-Wikipedia-500K']:
        # Read files with labels (old format from the Extreme Classification Repository)
        _, labels, num_samples, num_features, num_labels = data_utils.read_data(f'{cur_dir}/train.txt')
        smat.save_npz(f'{cur_dir}/normalized/Y.trn.npz', labels)
        smat.save_npz(f'{cur_dir}/raw/Y.trn.npz', labels)
        _, labels, num_samples, num_features, num_labels = data_utils.read_data(f'{cur_dir}/test.txt')
        smat.save_npz(f'{cur_dir}/normalized/Y.tst.npz', labels)
        smat.save_npz(f'{cur_dir}/raw/Y.tst.npz', labels)

        # Read files with BoW features (dim = 500000; the features in the old format have dim = 2381304).
        X_trn = data_utils.read_sparse_file(f"{cur_dir}/trn_X_Xf.txt", header=True)
        X_trn = sklearn.preprocessing.normalize(X_trn, norm='l2')

        X_tst = data_utils.read_sparse_file(f"{cur_dir}/tst_X_Xf.txt", header=True)
        X_tst = sklearn.preprocessing.normalize(X_tst, norm='l2')

        L_bow = data_utils.read_sparse_file(f"{cur_dir}/lbl_X_Xf.txt", header=True)
        L_bow = sklearn.preprocessing.normalize(L_bow, norm='l2')

        smat.save_npz(f"{cur_dir}/Y_bow.npz", L_bow)
        smat.save_npz(f"{cur_dir}/X_bow.trn.npz", X_trn)
        smat.save_npz(f"{cur_dir}/X_bow.tst.npz", X_tst)
    else:
        raise ValueError(f'Dataset {args.dataset} is not supported yet!')


if __name__ == "__main__":
    main()
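A quick sanity check after running the script (a minimal sketch; the paths follow the defaults above, and the tolerance is an assumption) confirms that the saved feature matrix is row-normalized and aligned with the label matrix:

import numpy as np
import scipy.sparse as smat

cur_dir = './dataset/LF-Amazon-131K'
X = smat.load_npz(f'{cur_dir}/X_bow.trn.npz')
Y = smat.load_npz(f'{cur_dir}/raw/Y.trn.npz')
row_norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1))).ravel()
assert np.allclose(row_norms[row_norms > 0], 1.0, atol=1e-4)  # rows are L2-normalized
assert X.shape[0] == Y.shape[0]  # instances and labels are aligned
print(X.shape, Y.shape)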
42 changes: 42 additions & 0 deletions examples/pina/Ensemble-PINA.py
@@ -0,0 +1,42 @@
from pecos.utils.smat_util import sorted_csr, CsrEnsembler, load_matrix, Metrics
import scipy.sparse as smat
import argparse
import os

def main():
    parser = argparse.ArgumentParser(description='PrepareXYstack')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-Amazon-131K')
    parser.add_argument('--model_name', type=str, default='v0')
    parser.add_argument('--DS_model_names', type=str, default='v0,v0-s1,v0-s2', help="The downstream model names, separated by ','. For example: 'v0,v0-s1,v0-s2'.")
    parser.add_argument('--feature_name', type=str, default='BoW')
    parser.add_argument('--ens_name', type=str, default='softmax', choices=['rank', 'softmax', 'sigmoid'])
    parser.add_argument('--L_option', type=str, default='Lft_xrt')
    parser.add_argument('--Pk', type=str, default='5')
    parser.add_argument('--Use_A', type=str, default='false')
    args = parser.parse_args()
    print(args)

    feature_dir = f"{args.work_dir}/dataset/{args.dataset}"
    TAGS = args.DS_model_names.split(',')
    assert len(TAGS) > 1, "Ensembling requires at least 2 models!"

    P_paths = []
    for tag in TAGS:
        P_paths.append(f"{args.work_dir}/models_LF/xtransformer/{args.dataset}/{args.model_name}/{args.feature_name}/XYstack/downstream/{tag}/{args.Pk}/{args.L_option}/P.20.npz")

    Y_true = sorted_csr(load_matrix(f"{feature_dir}/raw/Y.tst.npz").tocsr())
    Y_pred = [sorted_csr(load_matrix(pp).tocsr()) for pp in P_paths]
    print("==== evaluation results ====")
    ens = getattr(CsrEnsembler, f"{args.ens_name}_average")
    cur_pred = ens(*Y_pred)
    print(Metrics.generate(Y_true, cur_pred, topk=10))
    PATH = f"{args.work_dir}/models_LF/xtransformer/{args.dataset}/{args.model_name}/{args.feature_name}/XYstack/downstream/{args.DS_model_names}/{args.Pk}/{args.L_option}"
    os.makedirs(PATH, exist_ok=True)
    smat.save_npz(f"{PATH}/P.20.{args.ens_name}.npz", cur_pred)
    print("Ensembled P matrix saved!")
    print(f"Saved model path: {PATH}")
    print("To evaluate, please use this path with ./scripts/Ensemble_evaluations.sh")


if __name__ == "__main__":
    main()
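For reference, here is a minimal conceptual sketch of softmax averaging over sorted CSR prediction matrices. It is a plain scipy/sklearn illustration of the idea, not the pecos CsrEnsembler implementation; the helper name and exact semantics are assumptions:

import numpy as np
from sklearn.preprocessing import normalize

def softmax_average_sketch(*preds):
    # Exponentiate each prediction's stored scores, L1-normalize per row
    # (a softmax over the retained top-k entries), then average the results.
    acc = None
    for P in preds:
        P = P.copy()
        P.data = np.exp(P.data)
        P = normalize(P, norm="l1")
        acc = P if acc is None else acc + P
    return acc / len(preds)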
244 changes: 244 additions & 0 deletions examples/pina/PINA_augmentation.py
@@ -0,0 +1,244 @@
from pecos.xmc.xtransformer.model import XTransformer
import scipy.sparse as smat
import numpy as np
from pecos.utils import smat_util
import sklearn.preprocessing  # `import sklearn` alone does not expose sklearn.preprocessing
from pecos.xmc import LabelEmbeddingFactory
from pecos.core import clib as pecos_clib
import argparse


def CSR_rowwise_softmax(P):
    # Softmax over the stored (nonzero) entries of each CSR row:
    # exponentiate the data, then L1-normalize row-wise.
    P.data = np.exp(P.data).astype(np.float32)
    P = sklearn.preprocessing.normalize(P, norm='l1')
    return P
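
# Example (hypothetical values): a row with stored scores [1.0, 2.0] becomes
# [e^1, e^2] / (e^1 + e^2) ≈ [0.269, 0.731]. Unstored (zero) entries are left
# untouched, so this is a softmax over the retained top-k scores only.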

def main():
    parser = argparse.ArgumentParser(description='PrepareXYstack')
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--feature_name', type=str, default='BoW')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-AmazonTitles-131K')
    parser.add_argument('--L_option', type=str, default='Lft_xrt', choices=['Lf', 'Lft', 'Lf_xrt', 'Lft_xrt', 'Lxrt'])
    parser.add_argument('--Pk', type=int, default=5, help='Must be <= 20, since the saved P matrices keep top-20 predictions.')
    parser.add_argument('--Use_A', type=int, default=0, help='Use true neighbors for training data')
    parser.add_argument('--batch_size', type=int, default=256, help='batch size when applying XR-Transformer')
    parser.add_argument('--num_workers', type=int, default=48, help='number of workers for XR-Transformer')
    parser.add_argument('--text_normalization', type=str, default="raw", help='Use raw or normalized text.')
    args = parser.parse_args()
    print(args)

    # only_topk must be <= 20 because the saved P matrices keep top-20 predictions.
    topk = args.Pk

    feature_dir = f"{args.work_dir}/dataset/{args.dataset}"
    model_dir = f"{args.work_dir}/models_LF/xtransformer/{args.dataset}/{args.model_name}/{args.feature_name}/XYstack"

    # Remember to replace 20 with your own top k if you have modified it!
    P_trn = smat.load_npz(f"{model_dir}/P.20.trn.npz")
    P_tst = smat.load_npz(f"{model_dir}/P.20.tst.npz")

    # The XR-Transformer encoder is needed both for the *_xrt label features and
    # for the 0-hop instance embeddings below, so load it unconditionally.
    xtf = XTransformer.load(model_dir)

    # Apply row-wise softmax to turn scores into probabilities if P contains negative values.
    P_trn = smat_util.sorted_csr(P_trn, only_topk=topk)
    P_tst = smat_util.sorted_csr(P_tst, only_topk=topk)

    if P_trn.min() < 0 or P_tst.min() < 0:
        P_trn = CSR_rowwise_softmax(P_trn)
        P_tst = CSR_rowwise_softmax(P_tst)

    N_trn = P_trn.shape[0]
    print(f"{feature_dir}/{args.text_normalization}/Y_all.npz")
    Y_trn = smat.load_npz(f"{feature_dir}/{args.text_normalization}/Y_all.npz")[:N_trn, :]

    print(f"P_trn shape is {P_trn.shape}, max: {P_trn.max()}, min: {P_trn.min()}")
    print(f"P_tst shape is {P_tst.shape}, max: {P_tst.max()}, min: {P_tst.min()}")
    print(f"Y_trn shape is {Y_trn.shape}, max: {Y_trn.max()}, min: {Y_trn.min()}")

    # Get features of the pretraining XMC output space
    if args.L_option == "Lft_xrt":
        # Generate XR-Transformer dense embeddings for label text
        with open(f"{feature_dir}/{args.text_normalization}/output-items.txt") as f:
            text = f.readlines()
        L_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

        # Generate XR-Transformer dense embeddings for instance text
        with open(f"{feature_dir}/{args.text_normalization}/X.trn.txt") as f:
            text = f.readlines()
        X_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

        # Prepare [L|X] for the dense embeddings.
        L_emb = smat.csr_matrix(L_emb, dtype=np.float32)
        X_emb = smat.csr_matrix(X_emb, dtype=np.float32)
        All_emb = smat_util.vstack_csr([L_emb, X_emb])
        # Row normalization
        All_emb = sklearn.preprocessing.normalize(All_emb, norm='l2')

        # Load stacked instance features
        if args.feature_name in ['BoW']:
            X_all = smat.load_npz(f'{feature_dir}/X_bow.all.npz').astype(np.float32)
        else:
            X_all = smat.load_npz(f'{feature_dir}/X.tfidf.all.npz').astype(np.float32)

        X_all = sklearn.preprocessing.normalize(X_all, norm='l2')

        # Concatenate sparse and dense embeddings
        Lf1 = smat_util.hstack_csr([X_all, All_emb])

    elif args.L_option == "Lf_xrt":
        # Use PIFA
        # Load stacked instance features and the multilabel matrix
        if args.feature_name in ['BoW']:
            X_all = smat.load_npz(f'{feature_dir}/X_bow.all.npz').astype(np.float32)
        else:
            X_all = smat.load_npz(f'{feature_dir}/X.tfidf.all.npz').astype(np.float32)
        X_all = sklearn.preprocessing.normalize(X_all, norm='l2')
        Y_all = smat.load_npz(f'{feature_dir}/{args.text_normalization}/Y_all.npz').astype(np.float32)

        # Produce PIFA embeddings
        Lf1 = LabelEmbeddingFactory.create(Y_all, X_all, method="pifa")

        # Generate XR-Transformer dense embeddings for label text and instance text
        with open(f"{feature_dir}/{args.text_normalization}/output-items.txt") as f:
            text = f.readlines()
        L_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)
        with open(f"{feature_dir}/{args.text_normalization}/X.trn.txt") as f:
            text = f.readlines()
        X_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

        L_emb = smat.csr_matrix(L_emb, dtype=np.float32)
        X_emb = smat.csr_matrix(X_emb, dtype=np.float32)
        All_emb = smat_util.vstack_csr([L_emb, X_emb])
        # Row normalization
        All_emb = sklearn.preprocessing.normalize(All_emb, norm='l2')
        # Concatenate the dense XR-Transformer embeddings with the PIFA embeddings
        Lf1 = smat_util.hstack_csr([All_emb, Lf1])

    elif args.L_option == "Lft":
        # Load stacked instance features
        if args.feature_name in ['BoW']:
            X_all = smat.load_npz(f'{feature_dir}/X_bow.all.npz').astype(np.float32)
        else:
            X_all = smat.load_npz(f'{feature_dir}/X.tfidf.all.npz').astype(np.float32)

        Lf1 = sklearn.preprocessing.normalize(X_all, norm='l2')

    elif args.L_option == "Lf":
        # Load stacked instance features
        if args.feature_name in ['BoW']:
            X_all = smat.load_npz(f'{feature_dir}/X_bow.all.npz').astype(np.float32)
        else:
            X_all = smat.load_npz(f'{feature_dir}/X.tfidf.all.npz').astype(np.float32)

        X_all = sklearn.preprocessing.normalize(X_all, norm='l2')
        Y_all = smat.load_npz(f'{feature_dir}/{args.text_normalization}/Y_all.npz').astype(np.float32)

        # Produce PIFA embeddings
        Lf1 = LabelEmbeddingFactory.create(Y_all, X_all, method="pifa")

    elif args.L_option == "Lxrt":
        # Generate XR-Transformer dense embeddings for label text and instance text
        with open(f"{feature_dir}/{args.text_normalization}/output-items.txt") as f:
            text = f.readlines()
        L_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)
        with open(f"{feature_dir}/{args.text_normalization}/X.trn.txt") as f:
            text = f.readlines()
        X_emb = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

        L_emb = smat.csr_matrix(L_emb, dtype=np.float32)
        X_emb = smat.csr_matrix(X_emb, dtype=np.float32)
        Lf1 = smat_util.vstack_csr([L_emb, X_emb])

    else:
        raise ValueError(f'L_option {args.L_option} is not supported!')

    # Apply row-wise L2 normalization
    Lf1 = sklearn.preprocessing.normalize(Lf1, norm='l2')
    print(f"Feature shape for the pretraining XMC output space: {Lf1.shape}")

    # Prepare PINA augmentation

    # Keep per-hop features in lists to allow multi-hop generalization in the future.
    Hops_trn = []
    Hops_tst = []
    Hops_true = []

    # 0-hop: concatenate XR-Transformer dense embeddings with the sparse instance features.
    if args.feature_name in ['BoW']:
        X_trn = smat.load_npz(f"{feature_dir}/X_bow.trn.npz")
        X_tst = smat.load_npz(f"{feature_dir}/X_bow.tst.npz")
    else:
        X_trn = smat.load_npz(f'{feature_dir}/X.tfidf.trn.npz').astype(np.float32)
        X_tst = smat.load_npz(f'{feature_dir}/X.tfidf.tst.npz').astype(np.float32)

    with open(f"{feature_dir}/{args.text_normalization}/X.trn.txt") as f:
        text = f.readlines()
    X_emb_trn = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

    with open(f"{feature_dir}/{args.text_normalization}/X.tst.txt") as f:
        text = f.readlines()
    X_emb_tst = xtf.encode(text, batch_size=args.batch_size, batch_gen_workers=args.num_workers)

    X_emb_trn = smat.csr_matrix(X_emb_trn, dtype=np.float32)
    X_emb_tst = smat.csr_matrix(X_emb_tst, dtype=np.float32)
    X_emb_trn = sklearn.preprocessing.normalize(X_emb_trn, norm='l2')
    X_emb_tst = sklearn.preprocessing.normalize(X_emb_tst, norm='l2')

    X_trn = sklearn.preprocessing.normalize(X_trn, norm='l2')
    X_tst = sklearn.preprocessing.normalize(X_tst, norm='l2')

    X_trn = smat_util.hstack_csr([X_emb_trn, X_trn])
    X_tst = smat_util.hstack_csr([X_emb_tst, X_tst])

    X_trn = sklearn.preprocessing.normalize(X_trn, norm='l2')
    X_tst = sklearn.preprocessing.normalize(X_tst, norm='l2')

    Hops_trn.append(X_trn)
    Hops_tst.append(X_tst)
    Hops_true.append(X_trn)

    # 1-hop: aggregate neighbor features with the predicted (or true) neighbor weights.
    X_trn = pecos_clib.sparse_matmul(P_trn, Lf1)
    X_tst = pecos_clib.sparse_matmul(P_tst, Lf1)
    X_true = pecos_clib.sparse_matmul(Y_trn, Lf1)

    # Apply row-wise L2 normalization
    X_trn = sklearn.preprocessing.normalize(X_trn, norm='l2')
    X_tst = sklearn.preprocessing.normalize(X_tst, norm='l2')
    X_true = sklearn.preprocessing.normalize(X_true, norm='l2')

    Hops_trn.append(X_trn)
    Hops_tst.append(X_tst)
    Hops_true.append(X_true)

    # Concatenate all hops.
    X_cat_trn = smat_util.hstack_csr(Hops_trn)
    X_cat_tst = smat_util.hstack_csr(Hops_tst)
    X_cat_true = smat_util.hstack_csr(Hops_true)

    # Apply row-wise L2 normalization
    X_cat_trn = sklearn.preprocessing.normalize(X_cat_trn, norm='l2')
    X_cat_tst = sklearn.preprocessing.normalize(X_cat_tst, norm='l2')
    X_cat_true = sklearn.preprocessing.normalize(X_cat_true, norm='l2')

    print(f"X_trn shape is {X_cat_trn.shape}, max: {X_cat_trn.max()}, min: {X_cat_trn.min()}")
    print(f"X_tst shape is {X_cat_tst.shape}, max: {X_cat_tst.max()}, min: {X_cat_tst.min()}")
    print(f"X_true shape is {X_cat_true.shape}, max: {X_cat_true.max()}, min: {X_cat_true.min()}")

    smat.save_npz(f'{model_dir}/X_trn_P{args.Pk}{args.L_option}.npz', X_cat_trn.astype(np.float32))
    smat.save_npz(f'{model_dir}/X_tst_P{args.Pk}{args.L_option}.npz', X_cat_tst.astype(np.float32))
    smat.save_npz(f'{model_dir}/X_true_{args.L_option}.npz', X_cat_true.astype(np.float32))

    print("All set!")


if __name__ == "__main__":
    main()
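The core of the augmentation above is simple: each instance's feature row is concatenated with an L2-normalized aggregation of its predicted neighbors' features. A minimal sketch with toy matrices (all shapes, densities, and names here are hypothetical, and plain scipy matmul stands in for pecos_clib.sparse_matmul):

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
X = smat.random(4, 8, density=0.5, format="csr", dtype=np.float32, random_state=rng)   # instance features
Lf = smat.random(6, 8, density=0.5, format="csr", dtype=np.float32, random_state=rng)  # neighbor (output-space) features
P = smat.random(4, 6, density=0.4, format="csr", dtype=np.float32, random_state=rng)   # predicted neighbor weights

hop0 = normalize(X, norm="l2")
hop1 = normalize(P @ Lf, norm="l2")  # 1-hop neighbor aggregation
X_aug = normalize(smat.hstack([hop0, hop1], format="csr"), norm="l2")
print(X_aug.shape)  # (4, 16)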