Skip to content

Commit

Permalink
Add PINA example folder (#225)
Browse files Browse the repository at this point in the history
  • Loading branch information
jiong-zhang authored Jun 1, 2023
1 parent 480161b commit b9478e6
Show file tree
Hide file tree
Showing 29 changed files with 5,996 additions and 1 deletion.
65 changes: 65 additions & 0 deletions examples/pina/DataPrep_forXCrepo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import scipy.sparse as smat
import numpy as np
from pecos.utils import smat_util
import sklearn
import os
import sys
from xclib.data import data_utils

def _l2_normalized(matrix):
    """Return ``matrix`` with every row rescaled to unit L2 norm.

    The result of ``normalize`` is returned (and must be used by the caller)
    instead of relying on ``copy=False`` having modified ``matrix`` in place.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` only exposes
    # ``sklearn.preprocessing`` if some other package happened to load it.
    import sklearn.preprocessing

    return sklearn.preprocessing.normalize(matrix, norm='l2', copy=False)


def _save_labels(cur_dir, split, labels):
    """Write the label matrix of ``split`` into both ``normalized/`` and ``raw/``."""
    for sub_dir in ('normalized', 'raw'):
        target_dir = f'{cur_dir}/{sub_dir}'
        # save_npz does not create missing parent directories.
        os.makedirs(target_dir, exist_ok=True)
        smat.save_npz(f'{target_dir}/Y.{split}.npz', labels)


def _convert_old_format_split(cur_dir, source_name, split, with_features):
    """Convert one old-format file (``train.txt``/``test.txt``) to ``.npz`` files.

    Args:
        cur_dir: dataset directory that is read from and written to.
        source_name: file name inside ``cur_dir`` to read with ``read_data``.
        split: split tag used in the output file names (``trn`` or ``tst``).
        with_features: when true, also save the row-normalised float32
            features as ``X_bow.<split>.npz``.
    """
    # read_data also returns the sample/feature/label counts; they are unused.
    features, labels = data_utils.read_data(f'{cur_dir}/{source_name}')[:2]
    if with_features:
        features = _l2_normalized(features.astype(np.float32))
        smat.save_npz(f'{cur_dir}/X_bow.{split}.npz', features)
    _save_labels(cur_dir, split, labels)


def _convert_sparse_file(cur_dir, source_name, target_name):
    """Read a sparse text file with header, L2-normalise its rows and save it."""
    matrix = data_utils.read_sparse_file(f'{cur_dir}/{source_name}', header=True)
    smat.save_npz(f'{cur_dir}/{target_name}', _l2_normalized(matrix))


def main():
    """Convert a dataset under ``<work_dir>/dataset/<dataset>`` into ``.npz`` files.

    Command line arguments:
        --work_dir: root directory that contains ``dataset/`` (default ``.``).
        --dataset: dataset name; selects which input files are read.

    Outputs, all inside the dataset directory: ``X_bow.trn.npz``,
    ``X_bow.tst.npz``, ``Y_bow.npz`` and ``Y.trn.npz`` / ``Y.tst.npz`` in both
    ``normalized/`` and ``raw/``.

    Raises:
        ValueError: if ``--dataset`` is not one of the supported names.
    """
    parser = argparse.ArgumentParser(description='DataPrep_forXCrepo')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-Amazon-131K')
    args = parser.parse_args()
    print(args)

    old_format_datasets = ('LF-Amazon-131K', 'LF-WikiSeeAlso-320K', 'LF-Amazon-1.3M')
    separate_feature_datasets = ('LF-Wikipedia-500K',)

    # Reject unknown datasets before touching the file system.
    if args.dataset not in old_format_datasets + separate_feature_datasets:
        raise ValueError(f'Dataset {args.dataset} is not supported yet!')

    cur_dir = f'{args.work_dir}/dataset/{args.dataset}'

    if args.dataset in old_format_datasets:
        # Features and labels both come from the old XMLRepo format files.
        _convert_old_format_split(cur_dir, 'train.txt', 'trn', with_features=True)
        _convert_old_format_split(cur_dir, 'test.txt', 'tst', with_features=True)
        _convert_sparse_file(cur_dir, 'Yf.txt', 'Y_bow.npz')
    else:
        # Only the labels come from the old-format files ...
        _convert_old_format_split(cur_dir, 'train.txt', 'trn', with_features=False)
        _convert_old_format_split(cur_dir, 'test.txt', 'tst', with_features=False)
        # ... the features are read from separate files (BoW, dim = 500000.
        # The feature in the old format has dim = 2381304.)
        _convert_sparse_file(cur_dir, 'lbl_X_Xf.txt', 'Y_bow.npz')
        _convert_sparse_file(cur_dir, 'trn_X_Xf.txt', 'X_bow.trn.npz')
        _convert_sparse_file(cur_dir, 'tst_X_Xf.txt', 'X_bow.tst.npz')


if __name__ == "__main__":
    main()
42 changes: 42 additions & 0 deletions examples/pina/Ensemble-PINA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pecos.utils.smat_util import sorted_csr, CsrEnsembler, load_matrix, Metrics
import scipy.sparse as smat
import argparse
import os

def main():
    """Ensemble the predictions of several downstream models and save the result.

    For every tag in ``--DS_model_names`` the prediction matrix
    ``.../XYstack/downstream/<tag>/<Pk>/<L_option>/P.20.npz`` is loaded, the
    matrices are combined with ``CsrEnsembler.<ens_name>_average``, metrics
    against ``raw/Y.tst.npz`` are printed, and the ensembled matrix is written
    to ``.../XYstack/downstream/<DS_model_names>/<Pk>/<L_option>/P.20.<ens_name>.npz``.

    ``--Use_A`` is accepted but not used by this script.

    Exits through ``parser.error`` (status 2) when fewer than two model names
    are given.
    """
    parser = argparse.ArgumentParser(description='PrepareXYstack')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-Amazon-131K')
    parser.add_argument('--model_name', type=str, default='v0')
    parser.add_argument('--DS_model_names', type=str, default='v0,v0-s1,v0-s2', help="The DS_model_name should be seperated by ','. For example: 'v0,v0-s1,v0-s2'.")
    parser.add_argument('--feature_name', type=str, default='BoW')
    parser.add_argument('--ens_name', type=str, default='softmax', choices = ['rank', 'softmax', 'sigmoid'])
    parser.add_argument('--L_option', type=str, default='Lft_xrt')
    parser.add_argument('--Pk', type=str, default='5')
    parser.add_argument('--Use_A', type=str, default='false')
    args = parser.parse_args()
    print(args)

    # Ignore empty entries (e.g. a trailing comma) so that the count below
    # reflects real model names.
    tags = [tag.strip() for tag in args.DS_model_names.split(',') if tag.strip()]
    # An explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if len(tags) < 2:
        parser.error("--DS_model_names must list at least 2 models separated by ','.")

    downstream_dir = (
        f"{args.work_dir}/models_LF/xtransformer/{args.dataset}/"
        f"{args.model_name}/{args.feature_name}/XYstack/downstream"
    )
    pred_paths = [
        f"{downstream_dir}/{tag}/{args.Pk}/{args.L_option}/P.20.npz" for tag in tags
    ]

    Y_true = sorted_csr(load_matrix(f"{args.work_dir}/dataset/{args.dataset}/raw/Y.tst.npz").tocsr())
    Y_pred = [sorted_csr(load_matrix(path).tocsr()) for path in pred_paths]

    print("==== evaluation results ====")
    ensemble = getattr(CsrEnsembler, f"{args.ens_name}_average")
    cur_pred = ensemble(*Y_pred)
    print(Metrics.generate(Y_true, cur_pred, topk=10))

    # The output directory is named after the raw comma-separated model list.
    out_dir = f"{downstream_dir}/{args.DS_model_names}/{args.Pk}/{args.L_option}"
    os.makedirs(out_dir, exist_ok=True)
    smat.save_npz(f"{out_dir}/P.20.{args.ens_name}.npz", cur_pred)
    print("Ensembled P matrix saved!")
    print(f"Saved model path: {out_dir}")
    print("To evaluate, please use this path with ./scripts/Ensemble_evaluations.sh")


if __name__ == "__main__":
    main()
244 changes: 244 additions & 0 deletions examples/pina/PINA_augmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
from pecos.xmc.xtransformer.model import XTransformer
import scipy.sparse as smat
import numpy as np
from pecos.utils import smat_util
from pecos.utils.featurization.text.preprocess import Preprocessor
import sklearn
import os
from pecos.xmc import Indexer, LabelEmbeddingFactory
import sys
from pecos.core import clib as pecos_clib
from tqdm import tqdm
import argparse



def CSR_rowwise_softmax(P):
    """Apply a softmax over the stored entries of every row of a sparse matrix.

    Only explicitly stored entries take part in the softmax; positions that are
    not stored stay absent, and rows without stored entries stay empty.

    Args:
        P: sparse score matrix (anything ``scipy.sparse.csr_matrix`` accepts).

    Returns:
        A new float32 ``csr_matrix`` whose stored values in each non-empty row
        are positive and sum to 1.  ``P`` itself is not modified.
    """
    # Work on a copy so the caller's matrix keeps its original scores.
    out = smat.csr_matrix(P, dtype=np.float32, copy=True)
    if out.nnz == 0:
        return out

    # Segment description of the non-empty rows inside ``out.data``.
    counts = np.diff(out.indptr)
    non_empty = counts > 0
    starts = out.indptr[:-1][non_empty]
    lengths = counts[non_empty]

    # Subtract each row's maximum before exponentiating: the result is
    # mathematically unchanged, but exp() can no longer overflow to inf
    # (which would turn the row into NaN) or underflow a whole row to zero.
    row_max = np.maximum.reduceat(out.data, starts)
    shifted_exp = np.exp(out.data - np.repeat(row_max, lengths))

    row_sum = np.add.reduceat(shifted_exp, starts, dtype=np.float64)
    out.data = (shifted_exp / np.repeat(row_sum, lengths)).astype(np.float32)
    return out

def _l2_normalize_rows(matrix):
    """Return ``matrix`` with every row rescaled to unit L2 norm."""
    # Import the submodule explicitly instead of relying on another package
    # having already loaded ``sklearn.preprocessing``.
    import sklearn.preprocessing

    return sklearn.preprocessing.normalize(matrix, norm='l2')


def _encode_text_file(xtf, text_path, batch_size, num_workers):
    """Encode every line of ``text_path`` with ``xtf`` into a float32 CSR matrix."""
    with open(text_path, encoding="utf-8") as f:
        text = f.readlines()
    emb = xtf.encode(text, batch_size=batch_size, batch_gen_workers=num_workers)
    return smat.csr_matrix(emb, dtype=np.float32)


def _load_instance_features(feature_dir, feature_name, split):
    """Load the sparse instance features of ``split`` (``all``/``trn``/``tst``) as float32."""
    prefix = 'X_bow' if feature_name in ['BoW'] else 'X.tfidf'
    return smat.load_npz(f'{feature_dir}/{prefix}.{split}.npz').astype(np.float32)


def _build_output_space_features(args, feature_dir, xtf, X_emb_trn):
    """Build the (not yet normalised) features of the pretraining XMC output space.

    Args:
        args: parsed command line arguments (``L_option``, ``feature_name``,
            ``text_normalization``, ``batch_size``, ``num_workers`` are read).
        feature_dir: dataset directory.
        xtf: loaded XTransformer used to encode the label text.
        X_emb_trn: un-normalised dense embedding of ``X.trn.txt`` as CSR; it is
            reused here so the training text is only encoded once.

    Returns:
        A sparse matrix selected by ``args.L_option``:
        ``Lxrt``    -> [label emb ; train emb] stacked vertically
        ``Lft``     -> row-normalised stacked instance features
        ``Lft_xrt`` -> [stacked instance features | normalised dense emb]
        ``Lf``      -> PIFA embedding
        ``Lf_xrt``  -> [normalised dense emb | PIFA embedding]

    Raises:
        ValueError: for an unknown ``args.L_option``.
    """
    option = args.L_option
    if option not in ('Lf', 'Lft', 'Lf_xrt', 'Lft_xrt', 'Lxrt'):
        raise ValueError(f"L_option {option!r} is not implemented")

    text_dir = f"{feature_dir}/{args.text_normalization}"

    dense_all = None
    if option.endswith("xrt"):
        # Dense embedding of the label text, stacked on top of the train text.
        L_emb = _encode_text_file(
            xtf, f"{text_dir}/output-items.txt", args.batch_size, args.num_workers
        )
        dense_all = smat_util.vstack_csr([L_emb, X_emb_trn])
        if option == "Lxrt":
            return dense_all
        dense_all = _l2_normalize_rows(dense_all)

    X_all = _l2_normalize_rows(
        _load_instance_features(feature_dir, args.feature_name, 'all')
    )
    if option == "Lft":
        return X_all
    if option == "Lft_xrt":
        # Concat sparse and dense embedding.
        return smat_util.hstack_csr([X_all, dense_all])

    # Remaining options ("Lf", "Lf_xrt") use the PIFA embedding.
    Y_all = smat.load_npz(f"{text_dir}/Y_all.npz").astype(np.float32)
    pifa = LabelEmbeddingFactory.create(Y_all, X_all, method="pifa")
    if option == "Lf":
        return pifa
    return smat_util.hstack_csr([dense_all, pifa])


def main():
    """Build PINA-augmented train/test features and save them as ``.npz`` files.

    Steps:
      1. Load the prediction matrices ``P.20.trn.npz`` / ``P.20.tst.npz`` from
         the model directory, keep ``--Pk`` entries per row
         (``sorted_csr(..., only_topk=Pk)``) and, if any score is negative,
         turn each row into probabilities with ``CSR_rowwise_softmax``.
      2. Build the output-space feature matrix selected by ``--L_option``.
      3. 0-hop features: [dense text embedding | sparse instance features].
      4. 1-hop features: ``P @ Lf1`` (and ``Y_trn @ Lf1`` for the true
         neighbours).
      5. Concatenate the hops, L2-normalise the rows and save
         ``X_trn_P<Pk><L_option>.npz``, ``X_tst_P<Pk><L_option>.npz`` and
         ``X_true_<L_option>.npz`` into the model directory.

    ``--Use_A`` is accepted but not used by this script.

    Exits through ``parser.error`` (status 2) when ``--Pk`` is outside
    ``[1, 20]``.
    """
    parser = argparse.ArgumentParser(description='PrepareXYstack')
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--feature_name', type=str, default='BoW')
    parser.add_argument('--work_dir', type=str, default='.')
    parser.add_argument('--dataset', type=str, default='LF-AmazonTitles-131K')
    parser.add_argument('--L_option', type=str, default='Lft_xrt', choices=['Lf', 'Lft', 'Lf_xrt','Lft_xrt','Lxrt'])
    parser.add_argument('--Pk', type=int, default=5, help='Should be =< 20!!!')
    parser.add_argument('--Use_A', type=int, default=0, help='Use true neighbor for training data')
    parser.add_argument('--batch_size', type=int, default=256, help='batch size when applying XR-Transformer')
    parser.add_argument('--num_workers', type=int, default=48, help='number of workers XR-Transformer')
    parser.add_argument('--text_normalization', type=str, default="raw", help='Use raw or normalized text.')
    args = parser.parse_args()
    print(args)

    # Number of predictions stored per row in the P.<k>.*.npz files.
    # Remember to replace 20 with your own top k if you have modified it!
    stored_topk = 20
    if not 1 <= args.Pk <= stored_topk:
        parser.error(f"--Pk must be between 1 and {stored_topk}, got {args.Pk}.")

    # The truncation must match the Pk that ends up in the output file names.
    topk = args.Pk

    feature_dir = f"{args.work_dir}/dataset/{args.dataset}"
    text_dir = f"{feature_dir}/{args.text_normalization}"
    model_dir = f"{args.work_dir}/models_LF/xtransformer/{args.dataset}/{args.model_name}/{args.feature_name}/XYstack"

    P_trn = smat.load_npz(f"{model_dir}/P.{stored_topk}.trn.npz")
    P_tst = smat.load_npz(f"{model_dir}/P.{stored_topk}.tst.npz")

    # The encoder is needed for every L_option because the 0-hop features
    # always include the dense text embedding.
    xtf = XTransformer.load(model_dir)

    P_trn = smat_util.sorted_csr(P_trn, only_topk=topk)
    P_tst = smat_util.sorted_csr(P_tst, only_topk=topk)

    # use softmax row-wise to turn it into a probability. Since P contains negative values...
    if P_trn.min() < 0 or P_tst.min() < 0:
        P_trn = CSR_rowwise_softmax(P_trn)
        P_tst = CSR_rowwise_softmax(P_tst)

    N_trn = P_trn.shape[0]
    print("{}/{}/Y_all.npz".format(feature_dir, args.text_normalization))
    # The first N_trn rows of Y_all are used as the true neighbours of the
    # training instances.
    Y_trn = smat.load_npz(f"{text_dir}/Y_all.npz")[:N_trn, :]

    print(f"P_trn shape is {P_trn.shape}, max: {P_trn.max()}, min: {P_trn.min()}")
    print(f"P_tst shape is {P_tst.shape}, max: {P_tst.max()}, min: {P_tst.min()}")
    print(f"Y_trn shape is {Y_trn.shape}, max: {Y_trn.max()}, min: {Y_trn.min()}")

    # Dense embedding of the training text: computed once and shared by the
    # output-space features and the 0-hop features.
    X_emb_trn = _encode_text_file(
        xtf, f"{text_dir}/X.trn.txt", args.batch_size, args.num_workers
    )

    # Get features of pretraining XMC output space, with row wise l2 normalization.
    Lf1 = _l2_normalize_rows(
        _build_output_space_features(args, feature_dir, xtf, X_emb_trn)
    )
    print(f"Feature shape for the pretraining XMC output space: {Lf1.shape}")

    # Prepare PINA augmentation.
    # Keeping one list per hop allows for multi-hop generalization in the future...
    Hops_trn = []
    Hops_tst = []
    Hops_true = []

    # 0-Hop, also include xrt emb!!!
    X_emb_tst = _encode_text_file(
        xtf, f"{text_dir}/X.tst.txt", args.batch_size, args.num_workers
    )
    X_trn = smat_util.hstack_csr([
        _l2_normalize_rows(X_emb_trn),
        _l2_normalize_rows(_load_instance_features(feature_dir, args.feature_name, 'trn')),
    ])
    X_tst = smat_util.hstack_csr([
        _l2_normalize_rows(X_emb_tst),
        _l2_normalize_rows(_load_instance_features(feature_dir, args.feature_name, 'tst')),
    ])
    X_trn = _l2_normalize_rows(X_trn)
    X_tst = _l2_normalize_rows(X_tst)

    Hops_trn.append(X_trn)
    Hops_tst.append(X_tst)
    # The true-neighbour variant shares the 0-hop training features.
    Hops_true.append(X_trn)

    # 1-Hop: aggregate output-space features through predicted / true neighbours.
    Hops_trn.append(_l2_normalize_rows(pecos_clib.sparse_matmul(P_trn, Lf1)))
    Hops_tst.append(_l2_normalize_rows(pecos_clib.sparse_matmul(P_tst, Lf1)))
    Hops_true.append(_l2_normalize_rows(pecos_clib.sparse_matmul(Y_trn, Lf1)))

    # Concat all hops, then apply row wise l2 normalization.
    X_cat_trn = _l2_normalize_rows(smat_util.hstack_csr(Hops_trn))
    X_cat_tst = _l2_normalize_rows(smat_util.hstack_csr(Hops_tst))
    X_cat_true = _l2_normalize_rows(smat_util.hstack_csr(Hops_true))

    print(f"X_trn shape is {X_cat_trn.shape}, max: {X_cat_trn.max()}, min: {X_cat_trn.min()}")
    print(f"X_tst shape is {X_cat_tst.shape}, max: {X_cat_tst.max()}, min: {X_cat_tst.min()}")
    print(f"X_true shape is {X_cat_true.shape}, max: {X_cat_true.max()}, min: {X_cat_true.min()}")

    smat.save_npz(f'{model_dir}/X_trn_P{args.Pk}{args.L_option}.npz', X_cat_trn.astype(np.float32))
    smat.save_npz(f'{model_dir}/X_tst_P{args.Pk}{args.L_option}.npz', X_cat_tst.astype(np.float32))
    smat.save_npz(f'{model_dir}/X_true_{args.L_option}.npz', X_cat_true.astype(np.float32))

    print("All Set!!")


if __name__ == "__main__":
    main()
Loading

0 comments on commit b9478e6

Please sign in to comment.