diff --git a/examples/pecos-xrlinear-jmlr22/xrl_train.py b/examples/pecos-xrlinear-jmlr22/xrl_train.py index 818eb71..f73e993 100644 --- a/examples/pecos-xrlinear-jmlr22/xrl_train.py +++ b/examples/pecos-xrlinear-jmlr22/xrl_train.py @@ -112,22 +112,6 @@ def parse_arguments(): help="The max size of the leaf nodes of hierarchical 2-means clustering. If larger than total number of labels, One-Versus-All model will be trained. Default 100.", ) - parser.add_argument( - "--imbalanced-ratio", - type=float, - default=0.0, - metavar="FLOAT", - help="Value between 0.0 and 0.5 (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. Specifically, if an iteration of 2-means is clustering L labels, the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. (default 0.0)", - ) - - parser.add_argument( - "--imbalanced-depth", - type=int, - default=100, - metavar="INT", - help="After hierarchical 2-means clustering has reached this depth, it will continue clustering as if --imbalanced-ratio is set to 0.0. (default 100)", - ) - parser.add_argument( "--spherical", type=cli.str2bool, diff --git a/pecos/apps/text2text/train.py b/pecos/apps/text2text/train.py index e089ef0..44ba15f 100644 --- a/pecos/apps/text2text/train.py +++ b/pecos/apps/text2text/train.py @@ -113,22 +113,6 @@ def parse_arguments(args): help="number of splits used to construct hierarchy (a power of 2 is recommended, default 16)", ) - parser.add_argument( - "--imbalanced-ratio", - type=float, - default=0.0, - metavar="FLOAT", - help="Value between 0.0 and 0.5 (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. Specifically, if an iteration of 2-means is clustering L labels, the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. (default 0.0)", - ) - - parser.add_argument( - "--imbalanced-depth", - type=int, - default=100, - metavar="INT", - help="After hierarchical 2-means clustering has reached this depth, it will continue clustering as if --imbalanced-ratio is set to 0.0. (default 100)", - ) - parser.add_argument( "--label-embed-type", type=str, diff --git a/pecos/xmc/base.py b/pecos/xmc/base.py index ceebc24..edad390 100644 --- a/pecos/xmc/base.py +++ b/pecos/xmc/base.py @@ -30,7 +30,7 @@ XLINEAR_INFERENCE_MODEL_TYPES, ) from pecos.utils import smat_util -from pecos.utils.cluster_util import ClusterChain, hierarchical_kmeans +from pecos.utils.cluster_util import ClusterChain from sklearn.preprocessing import normalize LOGGER = logging.getLogger(__name__) @@ -86,11 +86,9 @@ class HierarchicalKMeans(Indexer): class TrainParams(pecos.BaseParams): # type: ignore """Training Parameters of Hierarchical K-means. - nr_splits (int, optional): The out-degree of each internal node of the tree. Ignored if `imbalanced_ratio != 0` because imbalanced clustering supports only 2-means. Default is `16`. + nr_splits (int, optional): The out-degree of each internal node of the tree. Default is `16`. min_codes (int): The number of direct child nodes that the top level of the hierarchy should have. max_leaf_size (int, optional): The maximum size of each leaf node of the tree. Default is `100`. - imbalanced_ratio (float, optional): Value between `0.0` and `0.5` (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. Specifically, if an iteration of 2-means is clustering `L` labels, the size of the output 2 clusters will be within approx `imbalanced_ratio * 2 * L` of each other. Default is `0.0`. - imbalanced_depth (int, optional): Maximum depth of imbalanced clustering. After depth `imbalanced_depth` is reached, balanced clustering will be used. Default is `100`. spherical (bool, optional): True will l2-normalize the centroids of k-means after each iteration. Default is `True`. seed (int, optional): Random seed. Default is `0`. kmeans_max_iter (int, optional): Maximum number of iterations for each k-means problem. Default is `20`. @@ -100,8 +98,6 @@ class TrainParams(pecos.BaseParams): # type: ignore nr_splits: int = 16 min_codes: int = None # type: ignore max_leaf_size: int = 100 - imbalanced_ratio: float = 0.0 - imbalanced_depth: int = 100 spherical: bool = True seed: int = 0 kmeans_max_iter: int = 20 @@ -138,59 +134,45 @@ def gen( f"HierarchicalKMeans train_params: {json.dumps(train_params.to_dict(), indent=True)}" ) - # use optimized c++ clustering code if doing balanced clustering - if train_params.imbalanced_ratio == 0: - nr_instances = feat_mat.shape[0] - if train_params.max_leaf_size >= nr_instances: - # no-need to do clustering - return ClusterChain.from_partial_chain( - smat.csc_matrix(np.ones((nr_instances, 1), dtype=np.float32)) - ) - - depth = max(1, int(math.ceil(math.log2(nr_instances / train_params.max_leaf_size)))) - if (2**depth) > nr_instances: - raise ValueError( - f"max_leaf_size > 1 is needed for feat_mat.shape[0] == {nr_instances} to avoid empty clusters" - ) + nr_instances = feat_mat.shape[0] + if train_params.max_leaf_size >= nr_instances: + # no-need to do clustering + return ClusterChain.from_partial_chain( + smat.csc_matrix(np.ones((nr_instances, 1), dtype=np.float32)) + ) - algo = cls.SKMEANS if train_params.spherical else cls.KMEANS + depth = max(1, int(math.ceil(math.log2(nr_instances / train_params.max_leaf_size)))) + if (2**depth) > nr_instances: + raise ValueError( + f"max_leaf_size > 1 is needed for feat_mat.shape[0] == {nr_instances} to avoid empty clusters" + ) - assert feat_mat.dtype == np.float32 - if isinstance(feat_mat, (smat.csr_matrix, ScipyCsrF32)): - py_feat_mat = ScipyCsrF32.init_from(feat_mat) - elif isinstance(feat_mat, (np.ndarray, ScipyDrmF32)): - py_feat_mat = ScipyDrmF32.init_from(feat_mat) - else: - raise NotImplementedError( - "type(feat_mat) = {} is not supported.".format(type(feat_mat)) - ) + algo = cls.SKMEANS if train_params.spherical else cls.KMEANS - codes = np.zeros(py_feat_mat.rows, dtype=np.uint32) - codes = clib.run_clustering( - py_feat_mat, - depth, - algo, - train_params.seed, - codes=codes, - kmeans_max_iter=train_params.kmeans_max_iter, - threads=train_params.threads, - ) - C = cls.convert_codes_to_csc_matrix(codes, depth) - cluster_chain = ClusterChain.from_partial_chain( - C, min_codes=train_params.min_codes, nr_splits=train_params.nr_splits - ) + assert feat_mat.dtype == np.float32 + if isinstance(feat_mat, (smat.csr_matrix, ScipyCsrF32)): + py_feat_mat = ScipyCsrF32.init_from(feat_mat) + elif isinstance(feat_mat, (np.ndarray, ScipyDrmF32)): + py_feat_mat = ScipyDrmF32.init_from(feat_mat) else: - cluster_chain = hierarchical_kmeans( - feat_mat, - max_leaf_size=train_params.max_leaf_size, - imbalanced_ratio=train_params.imbalanced_ratio, - imbalanced_depth=train_params.imbalanced_depth, - spherical=train_params.spherical, - seed=train_params.seed, - kmeans_max_iter=train_params.kmeans_max_iter, - threads=train_params.threads, + raise NotImplementedError( + "type(feat_mat) = {} is not supported.".format(type(feat_mat)) ) - cluster_chain = ClusterChain(cluster_chain) + + codes = np.zeros(py_feat_mat.rows, dtype=np.uint32) + codes = clib.run_clustering( + py_feat_mat, + depth, + algo, + train_params.seed, + codes=codes, + kmeans_max_iter=train_params.kmeans_max_iter, + threads=train_params.threads, + ) + C = cls.convert_codes_to_csc_matrix(codes, depth) + cluster_chain = ClusterChain.from_partial_chain( + C, min_codes=train_params.min_codes, nr_splits=train_params.nr_splits + ) return cluster_chain @staticmethod diff --git a/pecos/xmc/xlinear/train.py b/pecos/xmc/xlinear/train.py index d4cf207..dea27f7 100644 --- a/pecos/xmc/xlinear/train.py +++ b/pecos/xmc/xlinear/train.py @@ -102,22 +102,6 @@ def parse_arguments(): help="The max size of the leaf nodes of hierarchical 2-means clustering. If larger than total number of labels, One-Versus-All model will be trained. Default 100.", ) - parser.add_argument( - "--imbalanced-ratio", - type=float, - default=0.0, - metavar="FLOAT", - help="Value between 0.0 and 0.5 (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. Specifically, if an iteration of 2-means is clustering L labels, the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. (default 0.0)", - ) - - parser.add_argument( - "--imbalanced-depth", - type=int, - default=100, - metavar="INT", - help="After hierarchical 2-means clustering has reached this depth, it will continue clustering as if --imbalanced-ratio is set to 0.0. (default 100)", - ) - parser.add_argument( "--spherical", type=cli.str2bool, diff --git a/pecos/xmc/xtransformer/train.py b/pecos/xmc/xtransformer/train.py index bc303f5..c22af9e 100644 --- a/pecos/xmc/xtransformer/train.py +++ b/pecos/xmc/xtransformer/train.py @@ -147,20 +147,6 @@ def parse_arguments(): metavar="INT", help="The max size of the leaf nodes of hierarchical clustering. If larger than the number of labels, OVA model will be trained. Default 100.", ) - parser.add_argument( - "--imbalanced-ratio", - type=float, - default=0.0, - metavar="FLOAT", - help="Value between 0.0 and 0.5 (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. Specifically, if an iteration of 2-means is clustering L labels, the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. (default 0.0)", - ) - parser.add_argument( - "--imbalanced-depth", - type=int, - default=100, - metavar="INT", - help="After hierarchical 2-means clustering has reached this depth, it will continue clustering as if --imbalanced-ratio is set to 0.0. (default 100)", - ) # ========= matcher parameters ============ parser.add_argument( "--max-match-clusters", diff --git a/test/pecos/xmc/test_xmc.py b/test/pecos/xmc/test_xmc.py index 8ad8038..151596c 100644 --- a/test/pecos/xmc/test_xmc.py +++ b/test/pecos/xmc/test_xmc.py @@ -30,7 +30,6 @@ def test_hierarchicalkmeans(): smat.csr_matrix([[1, 0], [0.95, 0.05], [0.9, 0.1], [0, 1]], dtype=np.float32) ) target_balanced = [0, 0, 1, 1] - target_imbalanced = [0, 0, 0, 1] balanced_chain = Indexer.gen(feat_mat, max_leaf_size=3) balanced_assignments = (balanced_chain[-1].todense() == [0, 1]).all(axis=1).A1 @@ -38,12 +37,6 @@ def test_hierarchicalkmeans(): ~balanced_assignments, target_balanced ) - imbalanced_chain = Indexer.gen(feat_mat, imbalanced_ratio=0.4, max_leaf_size=3) - imbalanced_assignments = (imbalanced_chain[-1].todense() == [0, 1]).all(axis=1).A1 - assert np.array_equal(imbalanced_assignments, target_imbalanced) or np.array_equal( - ~imbalanced_assignments, target_imbalanced - ) - chain2 = Indexer.gen(feat_mat, max_leaf_size=1, nr_splits=2) chain4 = Indexer.gen(feat_mat, max_leaf_size=1, nr_splits=4)