From a63f6b245aadb6e20fed4ed353a4f3a62cfdffa6 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 22 Oct 2025 23:16:08 +0200 Subject: [PATCH 1/7] add rwth_dbis learner models --- ...llm_learner_rwthdbis_taxonomy_discovery.py | 57 ++ examples/llm_learner_rwthdbis_term_typing.py | 50 ++ ontolearner/__init__.py | 6 +- ontolearner/learner/__init__.py | 2 + .../learner/taxonomy_discovery/__init__.py | 15 + .../learner/taxonomy_discovery/rwthdbis.py | 792 ++++++++++++++++++ ontolearner/learner/term_typing/__init__.py | 15 + ontolearner/learner/term_typing/rwthdbis.py | 255 ++++++ requirements.txt | 3 + 9 files changed, 1194 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_rwthdbis_taxonomy_discovery.py create mode 100644 examples/llm_learner_rwthdbis_term_typing.py create mode 100644 ontolearner/learner/taxonomy_discovery/__init__.py create mode 100644 ontolearner/learner/taxonomy_discovery/rwthdbis.py create mode 100644 ontolearner/learner/term_typing/__init__.py create mode 100644 ontolearner/learner/term_typing/rwthdbis.py diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py new file mode 100644 index 0000000..fea5539 --- /dev/null +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -0,0 +1,57 @@ +# Import core modules from the OntoLearner library +from ontolearner import LearnerPipeline, train_test_split +from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner + +# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery +ontology = ChordOntology() +ontology.load() # Read entities, type system, and taxonomic edges into memory + +# Extract typed taxonomic edges and split into train/test while preserving the structured shape +train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42 +) + +# Initialize a supervised taxonomy classifier (encoder-based fine-tuning) +# Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views +# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions +learner = RWTHDBISTaxonomyLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + gradient_accumulation_steps=4, + learning_rate=2e-5, + max_length=256, + seed=42, + negative_ratio=5, + bidirectional_templates=True, + context_json_path=None, + ontology_name=ontology.ontology_full_name, +) + +# Build the pipeline +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# # Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs['elapsed_time']) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py new file mode 100644 index 0000000..67d207f --- /dev/null +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import 
LearnerPipeline, train_test_split, AgrO +from ontolearner import RWTHDBISTermTypingLearner + +#load the AgrO ontology. +# AgrO provides term-typing supervision where each term can be annotated with one or more types. +ontology = AgrO() +ontology.load() +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# Configure a supervised encoder-based classifier for term typing. +# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results. +learner = RWTHDBISTermTypingLearner( + model_name="microsoft/deberta-v3-small", + output_dir="./results/deberta-v3", + num_train_epochs=30, + per_device_train_batch_size=16, + gradient_accumulation_steps=2, + learning_rate=2e-5, + max_length=64, + seed=42, +) + +# Build the pipeline and pass raw structured objects end-to-end. +pipeline = LearnerPipeline( + llm=learner, + llm_id=learner.model_name, + ontologizer_data=False, +) + +# Run the full learning pipeline on the term-typing task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs['elapsed_time']) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 237bee8..0b6fd26 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -29,7 +29,9 @@ AutoRetrieverLearner, AutoRAGLearner, StandardizedPrompting, - LabelMapper) + LabelMapper, + RWTHDBISTaxonomyLearner, + RWTHDBISTermTypingLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -47,6 +49,8 @@ "LabelMapper", "LearnerPipeline", "Processor", + "RWTHDBISTaxonomyLearner", + "RWTHDBISTermTypingLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 0baf580..ad38f0b 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,3 +17,5 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper +from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner +from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py new file mode 100644 index 0000000..ab5b4f8 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
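+
+# The concrete taxonomy-discovery learner is implemented in rwthdbis.py; it is
+# re-exported from ontolearner.learner (and the top-level ontolearner package)
+# under the public alias RWTHDBISTaxonomyLearner.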
+ +from .rwthdbis import RWTHDBISSFTLearner diff --git a/ontolearner/learner/taxonomy_discovery/rwthdbis.py b/ontolearner/learner/taxonomy_discovery/rwthdbis.py new file mode 100644 index 0000000..47989c5 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/rwthdbis.py @@ -0,0 +1,792 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import random +import re +import time +import platform +import multiprocessing +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Callable +from functools import partial +from tqdm.auto import tqdm +import g4f +from g4f.client import Client as _G4FClient +import torch +from datasets import Dataset, DatasetDict +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) + +from ...base import AutoLearner + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised classifier for (parent, child) taxonomy edges. + + Model input format: + " ## " + + If no `context_json_path` is provided, the class precomputes a + context file ({ontology_name}_processed.json) directly from the ontology + object. + """ + + # Sentences containing any of these phrases are pruned from term_info. 
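+    # For example, a generated sentence like "I'm sorry, I couldn't find any
+    # information on this term" is dropped entirely when shards are merged; the
+    # remaining sentences are re-joined with '.' (the ontology/dataset name is also
+    # added to this removal list at merge time, see _merge_part_files).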
+ _CONTEXT_REMOVALS = [ + "couldn't find any", + "does not require", + "assist you further", + "feel free to", + "I'm currently unable", + "the search results", + "I'm unable to", + "recommend referring directly", + "bear with me", + "searching for the most relevant information", + "I'm currently checking the most relevant", + "already in English", + "require further", + "any additional information", + "already an English", + "don't have information", + "I'm sorry,", + "For further exploration", + "For more detailed information", + ] + + def __init__( + self, + min_predictions: int = 1, + model_name: str = "distilroberta-base", + output_dir: str = "./results/{model_name}", + max_length: int = 256, + per_device_train_batch_size: int = 8, + gradient_accumulation_steps: int = 4, + num_train_epochs: int = 1, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 25, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = True, + bf16: bool = False, + seed: int = 42, + negative_ratio: int = 5, + bidirectional_templates: bool = True, + context_json_path: Optional[str] = None, + ontology_name: str = "Geonames" + ) -> None: + super().__init__() + + self.model_name = model_name + self.safe_model_name = model_name.replace("/", "__") + + resolved_output = output_dir.format(model_name=self.safe_model_name) + self.output_dir = str(Path(resolved_output)) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + self.min_predictions = int(min_predictions) + self.max_length = int(max_length) + self.per_device_train_batch_size = int(per_device_train_batch_size) + self.gradient_accumulation_steps = int(gradient_accumulation_steps) + self.num_train_epochs = float(num_train_epochs) + self.learning_rate = float(learning_rate) + self.weight_decay = float(weight_decay) + self.logging_steps = int(logging_steps) + self.save_strategy = str(save_strategy) + self.save_total_limit = int(save_total_limit) + self.fp16 = bool(fp16) + self.bf16 = bool(bf16) + self.seed = int(seed) + + self.negative_ratio = int(negative_ratio) + self.bidirectional_templates = bool(bidirectional_templates) + self.context_json_path = context_json_path + + self.ontology_name = ontology_name + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + + os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + os.environ.setdefault("WANDB_DISABLED", "true") + os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") + + self._context_exact: Dict[str, str] = {} # lower(term) -> info + self._context_rows: List[Dict[str, str]] = [] # [{'term': str, 'term_info': str}, ...] 
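+
+    # For illustration (hypothetical terms, not taken from a real context file),
+    # _format_input("MusicalWork", "Song") yields roughly:
+    #   "MusicalWork is the superclass / parent / supertype / ancestor class of Song
+    #    ## Context. 'MusicalWork': <term_info> 'Song': <term_info>"
+    # With bidirectional_templates=True, a second subclass-phrased view of the same
+    # pair is added with the same label; if no context is found for either term,
+    # only the relation sentence is used.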
+ + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + return self._predict_pairs(data) if test else self._train_from_pairs(data) + + def _train_from_pairs(self, train_data: Any) -> None: + # Always (re)build context from ontology unless an explicit file is provided + if not self.context_json_path: + context_dir = Path(self.output_dir) / "context" + context_dir.mkdir(parents=True, exist_ok=True) + processed_context_file = context_dir / f"{self.ontology_name}_processed.json" + + # Remove stale file then regenerate + if processed_context_file.exists(): + try: + processed_context_file.unlink() + except Exception: + pass + + self.preprocess_context_from_ontology( + ontology=train_data, + processed_dir=context_dir, + dataset_name=self.ontology_name, + num_workers=max(1, min(os.cpu_count() or 2, 4)), + provider=partial(self._default_gpt_inference_with_dataset, dataset_name=self.ontology_name), + max_retries=5, + ) + + self.context_json_path = str(processed_context_file) + + # Reproducibility + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + # Build labeled pairs from ontology; context comes from preprocessed map + positive_pairs = self._extract_positive_pairs(train_data) + if not positive_pairs: + raise ValueError("No positive (parent, child) pairs found in train_data.") + + entity_names = sorted({parent for parent, _ in positive_pairs} | {child for _, child in positive_pairs}) + negative_pairs = self._generate_negatives( + positives=positive_pairs, + entities=entity_names, + ratio=self.negative_ratio, + ) + + labels, texts = self._build_text_dataset(positive_pairs, negative_pairs) + + + datasets = DatasetDict({"train": Dataset.from_dict({"label": labels, "text": texts})}) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + + def tokenize_batch(batch: Dict[str, List[str]]): + return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + + tokenized = datasets.map(tokenize_batch, batched=True, remove_columns=["text"]) + collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, + num_labels=2, + id2label={0: "incorrect", 1: "correct"}, + label2id={"incorrect": 0, "correct": 1}, + ) + if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + train_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + dataloader_pin_memory = bool(torch.cuda.is_available()), + fp16=self.fp16, + bf16=self.bf16, + report_to="none", + save_safetensors=True, + ) + + trainer = Trainer( + model=self.model, + args=train_args, + train_dataset=tokenized["train"], + tokenizer=self.tokenizer, + data_collator=collator, + ) + trainer.train() + trainer.save_model(self.output_dir) + 
self.tokenizer.save_pretrained(self.output_dir) + + def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: + import torch.nn.functional as F + + self._ensure_loaded_for_inference() + + candidate_pairs = self._extract_pairs_for_eval(eval_data) + if not candidate_pairs: + return [] + + accepted: List[Dict[str, str]] = [] + scored_candidates: List[Tuple[float, str, str, int]] = [] + + self.model.eval() + with torch.no_grad(): + for parent_term, child_term in candidate_pairs: + input_text = self._format_input(parent_term, child_term) + inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=self.max_length) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + logits = self.model(**inputs).logits + probs = F.softmax(logits, dim=-1).squeeze(0) + p_positive = float(probs[1].item()) + predicted_label = int(torch.argmax(logits, dim=-1).item()) + scored_candidates.append((p_positive, parent_term, child_term, predicted_label)) + if predicted_label == 1: + accepted.append({"parent": parent_term, "child": child_term}) + + if accepted: + return accepted + + top_k = max(0, int(self.min_predictions)) + if top_k == 0: + return [] + scored_candidates.sort(key=lambda item: item[0], reverse=True) + return [{"parent": parent_term, "child": child_term} + for (_prob, parent_term, child_term, _pred) in scored_candidates[:top_k]] + + def _ensure_loaded_for_inference(self) -> None: + if self.model is not None and self.tokenizer is not None: + return + self.model = AutoModelForSequenceClassification.from_pretrained(self.output_dir).to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir) + if self.tokenizer.pad_token_id is None and getattr(self.model.config, "pad_token_id", None) is not None: + self.tokenizer.pad_token_id = self.model.config.pad_token_id + + def _load_context_map(self) -> None: + """Build exact and fuzzy maps from {ontology_name}_processed.json.""" + if not (self.context_json_path): + self._context_exact = {} + self._context_rows = [] + return + try: + rows = json.load(open(self.context_json_path, "r", encoding="utf-8")) + self._context_exact = { + str(row.get("term", "")).strip().lower(): str(row.get("term_info", "")).strip() + for row in rows + } + self._context_rows = [ + {"term": str(row.get("term", "")), "term_info": str(row.get("term_info", ""))} + for row in rows + ] + except Exception: + self._context_exact = {} + self._context_rows = [] + + def _lookup_context_info(self, raw_term: str) -> str: + """ + Loose context lookup: split by commas, strip whitespace, case-insensitive + substring match against any row['term']. Join hits with '.'. + """ + if not raw_term: + return "" + term_key = raw_term.strip().lower() + if term_key in self._context_exact: + return self._context_exact[term_key] + + subterms = [re.sub(r"\s+", "", piece) for piece in raw_term.split(",")] + matched_infos: List[str] = [] + for subterm in subterms: + if not subterm: + continue + lower_subterm = subterm.lower() + for row in self._context_rows: + if lower_subterm in row["term"].lower(): + info = row.get("term_info", "") + if info: + matched_infos.append(info) + break # one hit per subterm + return ".".join(matched_infos) + + def _extract_positive_pairs(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Read pairs from ontology_obj.type_taxonomies.taxonomies (or fallback to .taxonomies). + Each item must provide 'parent' and 'child' attributes/keys. 
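+        For illustration, an item {"parent": "Place", "child": "City"} (or an object
+        with .parent == "Place" and .child == "City") yields the pair ("Place", "City").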
+ """ + type_taxonomies = getattr(ontology_obj, "type_taxonomies", None) + items = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology_obj, "taxonomies", None) + pairs: List[Tuple[str, str]] = [] + if items: + for item in items: + parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") + child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + + def _extract_pairs_for_eval(self, ontology_obj: Any) -> List[Tuple[str, str]]: + candidate_pairs = getattr(ontology_obj, "pairs", None) + if candidate_pairs: + pairs: List[Tuple[str, str]] = [] + for item in candidate_pairs: + parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") + child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + if parent_term and child_term: + pairs.append((str(parent_term), str(child_term))) + return pairs + return self._extract_positive_pairs(ontology_obj) + + def _generate_negatives( + self, + positives: List[Tuple[str, str]], + entities: List[str], + ratio: int, + ) -> List[Tuple[str, str]]: + positive_set = set(positives) + all_possible = {(parent_term, child_term) for parent_term in entities for child_term in entities if parent_term != child_term} + negative_candidates = list(all_possible - positive_set) + + target_count = max(len(positive_set) * max(1, ratio), len(positive_set)) + sample_count = min(target_count, len(negative_candidates)) + return random.sample(negative_candidates, k=sample_count) if sample_count > 0 else [] + + def _build_text_dataset( + self, + positives: List[Tuple[str, str]], + negatives: List[Tuple[str, str]], + ) -> Tuple[List[int], List[str]]: + self._load_context_map() + + labels: List[int] = [] + input_texts: List[str] = [] + + def add_example(parent_term: str, child_term: str, label_value: int) -> None: + input_texts.append(self._format_input(parent_term, child_term)) + labels.append(label_value) + if self.bidirectional_templates: + input_texts.append(self._format_input(child_term, parent_term, reverse=True)) + labels.append(label_value) + + for parent_term, child_term in positives: + add_example(parent_term, child_term, 1) + for parent_term, child_term in negatives: + add_example(parent_term, child_term, 0) + + return labels, input_texts + + def _format_input(self, parent_term: str, child_term: str, reverse: bool = False) -> str: + relation_text = ( + f"{child_term} is a subclass / child / subtype / descendant class of {parent_term}" + if reverse + else f"{parent_term} is the superclass / parent / supertype / ancestor class of {child_term}" + ) + + parent_info = self._lookup_context_info(parent_term) + child_info = self._lookup_context_info(child_term) + if not parent_info and not child_info: + return relation_text + + context_text = f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}" + return f"{relation_text} {context_text}" + + @staticmethod + def _is_windows() -> bool: + return (os.name == "nt") or (platform.system().lower() == "windows") + + @staticmethod + def _default_gpt_inference_with_dataset(term: str, dataset_name: str) -> str: + """ + Generate a plain-text description for `term`, tailored by `dataset_name`. + Uses g4f if available; otherwise returns an empty string. 
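+        The raw response is post-processed below: markdown characters ('*', '-', '#')
+        and citation-style links are stripped, whitespace is collapsed, and sentences
+        containing '?' are dropped before the text is returned.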
+ """ + prompt = ( + f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " + "Provide as detailed a definition of this term as possible in plain text.without any markdown format." + "No reference link in result. " + "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" + "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" + "Output: Plain text paragraphs only, neutral and factual." + f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." + ) + + try: + client = _G4FClient() + response = client.chat.completions.create( + model=g4f.models.default, + messages=[{"role": "user", "content": prompt}], + ) + raw_text = response.choices[0].message.content if response and response.choices else "" + except Exception: + raw_text = "" # or some deterministic fallback + + # Clean up + cleaned = re.sub(r"[\*\-\#]", "", raw_text) + cleaned = re.sub(r"\n\s*\n", " ", cleaned) + cleaned = cleaned.replace("\n", " ") + cleaned = re.sub(r"\s{2,}", " ", cleaned) + cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", cleaned) + sentences = [sentence for sentence in cleaned.split(".") if "?" not in sentence] + return ".".join(sentences).strip() + + @staticmethod + def _clean_term_info(raw_text: str) -> str: + """Normalize whitespace and remove link artifacts.""" + cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", str(raw_text)) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned + + @classmethod + def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: List[Path]) -> None: + merged_rows: List[dict] = [] + for part_path in part_paths: + try: + if not part_path.is_file(): + continue + part_content = json.load(open(part_path, "r", encoding="utf-8")) + if isinstance(part_content, list): + merged_rows.extend(part_content) + elif isinstance(part_content, dict): + merged_rows.append(part_content) + except Exception: + continue + + removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + kept_sentences: List[str] = [] + for sentence in term_info_raw.split("."): + sentence_no_links = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence) + if any(marker in sentence_no_links for marker in removal_markers): + continue + kept_sentences.append(sentence_no_links) + row["term_info"] = cls._clean_term_info(".".join(kept_sentences)) + + merged_path.parent.mkdir(parents=True, exist_ok=True) + json.dump(merged_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + + # best-effort cleanup + for part_path in part_paths: + try: + os.remove(part_path) + except Exception: + pass + + @staticmethod + def _fill_bucket_threaded(bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str]) -> None: + start_index = 0 + try: + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + start_index = len(existing_rows) + except Exception: + pass + + for row_index in range(start_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) + except Exception: + bucket_rows[row_index]["term_info"] = "" + if row_index % 10 == 1: + 
json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + @staticmethod + def _fill_bucket_process( + worker_id: int, + bucket_rows: List[dict], + output_path: Path, + provider: Callable[[str], str], + progress_map: "multiprocessing.managers.DictProxy", + ) -> None: + current_index = 0 + try: + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + current_index = len(existing_rows) + except Exception: + pass + + progress_map[worker_id] = current_index + + for row_index in range(current_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) + except Exception: + bucket_rows[row_index]["term_info"] = "" + progress_map[worker_id] = row_index + 1 + if row_index % 10 == 1: + json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + + json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) + progress_map[worker_id] = len(bucket_rows) + + @classmethod + def _execute_for_terms( + cls, + terms: List[str], + merged_path: Path, + part_paths: List[Path], + provider: Callable[[str], str], + dataset_name: str, + num_workers: int = 2, + ) -> None: + """ + Generate context for `terms`, writing shards to `part_paths`, then merge. + Threads on Windows; processes on POSIX. + """ + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + all_rows = [{"id": row_index, "term": term, "term_info": ""} for row_index, term in enumerate(terms)] + + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for reversed_index, row in enumerate(reversed(all_rows)): + buckets[reversed_index % worker_count].append(row) + + if cls._is_windows(): + total_rows = len(terms) + progress_bar = tqdm(total=total_rows, desc=f"{dataset_name} generation (threads)") + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + cls._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [pool.submit(run_bucket, buckets[bucket_index], part_paths[bucket_index]) + for bucket_index in range(worker_count)] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + else: + manager = multiprocessing.Manager() + progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) + + processes: List[multiprocessing.Process] = [] + for worker_index, bucket_rows in enumerate(buckets): + process = multiprocessing.Process( + target=cls._fill_bucket_process, + args=(worker_index, bucket_rows, part_paths[worker_index], provider, progress_map), + ) + processes.append(process) + process.start() + + total_rows = len(terms) + with tqdm(total=total_rows, desc=f"{dataset_name} generation") as progress_bar: + previous_total = 0 + while any(process.is_alive() for process in processes): + current_total = int(sum(progress_map.values())) + progress_bar.update(current_total - previous_total) + previous_total = current_total + time.sleep(0.5) + current_total = int(sum(progress_map.values())) + if current_total > previous_total: + 
progress_bar.update(current_total - previous_total) + + for process in processes: + process.join() + + cls._merge_part_files(dataset_name, merged_path, part_paths) + + @classmethod + def _re_infer_short_entries( + cls, + merged_path: Path, + re_part_paths: List[Path], + re_merged_path: Path, + provider: Callable[[str], str], + dataset_name: str, + num_workers: int, + ) -> int: + """ + Re-query terms with too-short term_info (< 50 chars). Returns remaining count. + """ + merged_rows = json.load(open(merged_path, "r", encoding="utf-8")) + + removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + short_rows: List[dict] = [] + long_rows: List[dict] = [] + + for row in merged_rows: + term_info_raw = str(row.get("term_info", "")) + sentences = term_info_raw.split(".") + for marker in removal_markers: + sentences = [sentence if marker not in sentence else "" for sentence in sentences] + filtered_info = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", ".".join(sentences)) + row["term_info"] = filtered_info + (short_rows if len(filtered_info) < 50 else long_rows).append(row) + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + buckets: List[List[dict]] = [[] for _ in range(worker_count)] + for row_index, row in enumerate(short_rows): + buckets[row_index % worker_count].append(row) + + # clean old re-inference shards + for path in re_part_paths: + try: + os.remove(path) + except Exception: + pass + + total_candidates = len(short_rows) + if cls._is_windows(): + progress_bar = tqdm(total=total_candidates, desc=f"{dataset_name} re-inference (threads)") + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + cls._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [pool.submit(run_bucket, buckets[bucket_index], re_part_paths[bucket_index]) + for bucket_index in range(worker_count)] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + else: + manager = multiprocessing.Manager() + progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) + + processes: List[multiprocessing.Process] = [] + for worker_index, bucket_rows in enumerate(buckets): + process = multiprocessing.Process( + target=cls._fill_bucket_process, + args=(worker_index, bucket_rows, re_part_paths[worker_index], provider, progress_map), + ) + processes.append(process) + process.start() + + with tqdm(total=total_candidates, desc=f"{dataset_name} re-inference") as progress_bar: + previous_total = 0 + while any(process.is_alive() for process in processes): + current_total = int(sum(progress_map.values())) + progress_bar.update(current_total - previous_total) + previous_total = current_total + time.sleep(1) + if progress_bar.n < total_candidates: + progress_bar.update(total_candidates - progress_bar.n) + + for process in processes: + process.join() + + # merge and write back + cls._merge_part_files(dataset_name, re_merged_path, re_part_paths) + new_rows = json.load(open(re_merged_path, "r", encoding="utf-8")) if re_merged_path.is_file() else [] + final_rows = long_rows + new_rows + json.dump(final_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + + remaining_short = sum(1 for row in final_rows if len(str(row.get("term_info", ""))) < 50) + return remaining_short + + @staticmethod + def _extract_terms_from_ontology(ontology: Any) -> 
List[str]: + """ + Collect unique term names from ontology.type_taxonomies.taxonomies. + """ + type_taxonomies = getattr(ontology, "type_taxonomies", None) + taxonomies = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology, "taxonomies", None) + unique_terms: set[str] = set() + if taxonomies: + for row in taxonomies: + parent_term = getattr(row, "parent", None) if not isinstance(row, dict) else row.get("parent") + child_term = getattr(row, "child", None) if not isinstance(row, dict) else row.get("child") + if parent_term: + unique_terms.add(str(parent_term)) + if child_term: + unique_terms.add(str(child_term)) + return sorted(unique_terms) + + def preprocess_context_from_ontology( + self, + ontology: Any, + processed_dir: str | Path, + dataset_name: str = "GeoNames", + num_workers: int = 2, + provider: Optional[Callable[[str], str]] = None, + max_retries: int = 5, + ) -> Path: + """ + Build {id, term, term_info} from an ontology object. + Always regenerates {dataset_name}_processed.json. + """ + provider = provider or provider or partial(self._default_gpt_inference_with_dataset, dataset_name=dataset_name) + + processed_dir = Path(processed_dir) + processed_dir.mkdir(parents=True, exist_ok=True) + + merged_path = processed_dir / f"{dataset_name}_processed.json" + if merged_path.exists(): + try: + merged_path.unlink() + except Exception: + pass + + worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) + shard_paths = [processed_dir / f"{dataset_name}_type_part{shard_index}.json" for shard_index in range(worker_count)] + reinf_paths = [processed_dir / f"{dataset_name}_re_inference{shard_index}.json" for shard_index in range(worker_count)] + reinf_merged_path = processed_dir / f"{dataset_name}_Types_re_inference.json" + + # remove any leftover shards + for path in shard_paths + reinf_paths + [reinf_merged_path]: + try: + if path.exists(): + path.unlink() + except Exception: + pass + + unique_terms = self._extract_terms_from_ontology(ontology) + print(f"[Preprocess] Unique terms from ontology: {len(unique_terms)}") + + self._execute_for_terms( + terms=unique_terms, + merged_path=merged_path, + part_paths=shard_paths, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + + retry_round = 0 + while retry_round < max_retries: + remaining_count = self._re_infer_short_entries( + merged_path=merged_path, + re_part_paths=reinf_paths, + re_merged_path=reinf_merged_path, + provider=provider, + dataset_name=dataset_name, + num_workers=worker_count, + ) + print(f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}") + retry_round += 1 + if remaining_count == 0: + break + + print(f"[Preprocess] Done. Merged context at: {merged_path}") + self.context_json_path = str(merged_path) + return merged_path diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py new file mode 100644 index 0000000..ab5b4f8 --- /dev/null +++ b/ontolearner/learner/term_typing/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .rwthdbis import RWTHDBISSFTLearner diff --git a/ontolearner/learner/term_typing/rwthdbis.py b/ontolearner/learner/term_typing/rwthdbis.py new file mode 100644 index 0000000..f27fd56 --- /dev/null +++ b/ontolearner/learner/term_typing/rwthdbis.py @@ -0,0 +1,255 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +from typing import Any, Dict, List, Optional, Tuple + +import torch +from datasets import Dataset, DatasetDict +from tqdm.auto import tqdm +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + set_seed, +) +from transformers import DebertaV2Tokenizer + +from ...base import AutoLearner + +class RWTHDBISSFTLearner(AutoLearner): + """ + Supervised term-typing + + Training expands multi-label examples into multiple single-label rows. + Inference returns: [{"term": "", "types": [""]}, ...] + """ + + def __init__( + self, + model_name: str = "microsoft/deberta-v3-small", + trained_model_path: Optional[str] = None, + output_dir: Optional[str] = None, + max_length: int = 64, + per_device_train_batch_size: int = 16, + gradient_accumulation_steps: int = 2, + num_train_epochs: int = 3, + learning_rate: float = 2e-5, + weight_decay: float = 0.01, + logging_steps: int = 50, + save_strategy: str = "epoch", + save_total_limit: int = 1, + fp16: bool = False, + bf16: bool = False, + seed: int = 42 + ) -> None: + super().__init__() + self.model_name = model_name + self.trained_model_path = trained_model_path + self.output_dir = output_dir or "./term_typing" + os.makedirs(self.output_dir, exist_ok=True) + + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model: Optional[AutoModelForSequenceClassification] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.id2label: Dict[int, str] = {} + self.label2id: Dict[str, int] = {} + + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + train: expects ontology-like object with .term_typings + test: returns List[{"term": str, "types": [str]}] (for evaluator) + """ + if not test: + return self._train_from_term_typings(train_data=data) + + terms = self._collect_eval_terms(data) + return self._predict_structured_output(terms) + + def _load_robust_tokenizer(self, backbone: str) -> AutoTokenizer: + try: + return AutoTokenizer.from_pretrained(backbone, use_fast=True) + except Exception as fast_err: + print(f"[tokenizer] Fast 
tokenizer failed: {fast_err}. Trying DebertaV2Tokenizer (slow)...") + + try: + return DebertaV2Tokenizer.from_pretrained(backbone) + except Exception as slow_err: + print(f"[tokenizer] DebertaV2Tokenizer failed: {slow_err}. Trying AutoTokenizer(use_fast=False)...") + + try: + return AutoTokenizer.from_pretrained(backbone, use_fast=False) + except Exception as final_err: + raise RuntimeError( + "Failed to load a tokenizer for this DeBERTa model.\n" + "Try:\n" + " - pip install --upgrade sentencepiece\n" + " - ensure network access for model files\n" + " - clear your HF cache and retry\n" + " - pin versions: transformers==4.43.*, tokenizers<0.20\n" + f"Original error: {final_err}" + ) + + def _expand_multilabel_training_rows( + self, term_typings: List[Any] + ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]: + """ + From multi-label instances -> (texts, label_ids), and label maps. + """ + label_strings: List[str] = [] + for instance in term_typings: + label_strings.extend([str(label) for label in instance.types]) + + unique_labels = sorted(set(label_strings)) + id2label = {i: label for i, label in enumerate(unique_labels)} + label2id = {label: i for i, label in enumerate(unique_labels)} + + texts: List[str] = [] + label_ids: List[int] = [] + for instance in term_typings: + term_text = str(instance.term) + for label in instance.types: + texts.append(term_text) + label_ids.append(label2id[str(label)]) + + return texts, label_ids, id2label, label2id + + def _collect_eval_terms(self, eval_data: Any) -> List[str]: + """ + Accepts List[str] OR object with .term_typings; returns list of term texts. + """ + if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): + terms = eval_data + else: + term_typings = getattr(eval_data, "term_typings", None) + if term_typings is None: + raise ValueError("Provide a List[str] OR an object with .term_typings for test=True.") + terms = [str(instance.term) for instance in term_typings] + return terms + + def _train_from_term_typings(self, train_data: Any) -> None: + set_seed(self.seed) + random.seed(self.seed) + torch.manual_seed(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + + term_typings: List[Any] = getattr(train_data, "term_typings", None) + if term_typings is None: + raise ValueError("train_data must provide .term_typings for term-typing.") + + texts, label_ids, self.id2label, self.label2id = self._expand_multilabel_training_rows(term_typings) + + dataset = DatasetDict({"train": Dataset.from_dict({"labels": label_ids, "text": texts})}) + + backbone = self.trained_model_path or self.model_name + self.tokenizer = self._load_robust_tokenizer(backbone) + + def tokenize_batch(batch: Dict[str, List[str]]): + return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + + tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"]) + data_collator = DataCollatorWithPadding(self.tokenizer) + + self.model = AutoModelForSequenceClassification.from_pretrained( + backbone, + num_labels=len(self.id2label), + id2label=self.id2label, + label2id=self.label2id, + ) + + if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + training_args = TrainingArguments( + output_dir=self.output_dir, + learning_rate=self.learning_rate, + per_device_train_batch_size=self.per_device_train_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, 
+ num_train_epochs=self.num_train_epochs, + weight_decay=self.weight_decay, + save_strategy=self.save_strategy, + save_total_limit=self.save_total_limit, + logging_steps=self.logging_steps, + fp16=self.fp16, + bf16=self.bf16, + report_to=[], + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=tokenized["train"], + tokenizer=self.tokenizer, + data_collator=data_collator, + ) + + trainer.train() + trainer.save_model(self.output_dir) + self.tokenizer.save_pretrained(self.output_dir) + + def _ensure_loaded_for_inference(self) -> None: + if self.model is not None and self.tokenizer is not None: + return + model_path = self.trained_model_path or self.output_dir + self.model = AutoModelForSequenceClassification.from_pretrained(model_path) + self.tokenizer = self._load_robust_tokenizer(model_path) + + cfg = self.model.config + if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"): + self.id2label = dict(cfg.id2label) + self.label2id = dict(cfg.label2id) + + self.model.to(self.device).eval() + + def _predict_label_ids(self, terms: List[str]) -> List[int]: + self._ensure_loaded_for_inference() + predictions: List[int] = [] + for term_text in tqdm(terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"): + inputs = self.tokenizer(term_text, return_tensors="pt", truncation=True, max_length=self.max_length) + inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()} + with torch.no_grad(): + logits = self.model(**inputs).logits + predictions.append(int(torch.argmax(logits, dim=-1).item())) + return predictions + + def _predict_structured_output(self, terms: List[str]) -> List[Dict[str, List[str]]]: + """ + Convert predicted IDs into evaluator format: + [{"term": "", "types": [""]}, ...] + """ + label_ids = self._predict_label_ids(terms) + id2label_map = self.id2label or {} # fallback handled below + + results: List[Dict[str, List[str]]] = [] + for term_text, label_id in zip(terms, label_ids): + label_str = id2label_map.get(int(label_id), str(int(label_id))) + results.append({"term": term_text, "types": [label_str]}) + return results diff --git a/requirements.txt b/requirements.txt index 3ce19f7..6d71bd5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,6 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 +g4f +protobuf<5 +accelerate>=0.26.0 From 16457094ce35731b67f55b7f1bc27b5621242b20 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 29 Oct 2025 15:34:23 +0100 Subject: [PATCH 2/7] added skhnlp learner models --- ..._learner_skhnlp_sft_taxonomoy_discovery.py | 64 ++ ...m_learner_skhnlp_zs_taxonomoy_discovery.py | 50 ++ ontolearner/__init__.py | 6 +- ontolearner/learner/__init__.py | 1 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/skhnlp.py | 761 ++++++++++++++++++ requirements.txt | 1 + 7 files changed, 883 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py create mode 100644 examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py create mode 100644 ontolearner/learner/taxonomy_discovery/skhnlp.py diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py new file mode 100644 index 0000000..3661a5b --- /dev/null +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -0,0 +1,64 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline 
+from ontolearner import SKHNLPSequentialFTLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). +ontology = GeoNames() +ontology.load() +data = ontology.extract() + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42 +) + +# Configure the learner with user-defined training args + device +# Configure the supervised BERT SFT Learner for taxonomy discovery. +# This fine-tunes BERT-Large using Sequential Prompts on (Parent, Child) pairs. +bert_learner = SKHNLPSequentialFTLearner( + model_name="bert-large-uncased", + n_prompts=2, + random_state=1403, + device="cpu", # Note: CPU training for BERT-Large is very slow. + output_dir="./results/", + num_train_epochs=1, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + warmup_steps=500, + weight_decay=0.01, + logging_dir="./logs/", + logging_steps=50, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) + +# Build pipeline and run +# Build the pipeline, passing the BERT Learner. +pipeline = LearnerPipeline( + llm=bert_learner, + llm_id="bert-large-uncased", + ontologizer_data=False, +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py new file mode 100644 index 0000000..90391f5 --- /dev/null +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline, SKHNLPZSLearner + +#Load ontology and split data +# The GeoNames ontology provides geographic term types and relationships. +ontology = GeoNames() +ontology.load() +train_data, test_data = train_test_split( + ontology.extract(), + test_size=0.2, + random_state=42, +) + +# Configure the learner with user-defined generation and normalization settings +# Configure the Zero-Shot Qwen Learner for taxonomy discovery. +# This model uses a fixed prompt and string normalization (Levenshtein) to classify terms. 
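+# With normalize_mode="levenshtein", the model's free-text answer is presumably
+# snapped to the closest candidate label by edit distance; "substring" and "auto"
+# are alternative matching strategies (see SKHNLPZSLearner for the exact behavior).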
+llm_learner = SKHNLPZSLearner( + model_name="Qwen/Qwen2.5-0.5B-Instruct", + device="cpu", # use "cuda" if you have a GPU + max_new_tokens=16, + save_path="./outputs/", # directory or full file path for CSV + verbose=True, + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" +) + +# Build pipeline and run +pipe = LearnerPipeline( + llm=llm_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=False, + device="cpu", +) + +# Run the full learning pipeline on the taxonomy-discovery task +outputs = pipe( + train_data=train_data, # zero-shot; ignored by the LLM learner + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for training + prediction + evaluation +print("Elapsed time:", outputs["elapsed_time"]) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 0b6fd26..d9ba608 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -31,7 +31,9 @@ StandardizedPrompting, LabelMapper, RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner) + RWTHDBISTermTypingLearner + SKHNLPZSLearner, + SKHNLPSequentialFTLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -51,6 +53,8 @@ "Processor", "RWTHDBISTaxonomyLearner", "RWTHDBISTermTypingLearner", + "SKHNLPZSLearner", + "SKHNLPSequentialFTLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index ad38f0b..3c56154 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -19,3 +19,4 @@ from .label_mapper import LabelMapper from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner +from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index ab5b4f8..2c6b452 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .rwthdbis import RWTHDBISSFTLearner +from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner diff --git a/ontolearner/learner/taxonomy_discovery/skhnlp.py b/ontolearner/learner/taxonomy_discovery/skhnlp.py new file mode 100644 index 0000000..fbe53b4 --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/skhnlp.py @@ -0,0 +1,761 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
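+
+# This module implements the SKHNLP taxonomy-discovery learners re-exported from
+# ontolearner.learner: SKHNLPSequentialFTLearner (BERT fine-tuned on sequential
+# taxonomy prompts) and SKHNLPZSLearner (zero-shot prompting with answer
+# normalization), together with the SKHNLPTaxonomyPrompts template helper.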
+ +import os +import re +import random + +import pandas as pd +import torch +import Levenshtein +from datasets import Dataset +from typing import Any, Optional, List, Tuple, Dict +from transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + BertTokenizer, + BertForSequenceClassification, + pipeline, + Trainer, + TrainingArguments, +) + +from ...base import AutoLearner, AutoPrompt +from ...utils import taxonomy_split, train_test_split as ontology_split +from ...data_structure import OntologyData, TaxonomicRelation + +class SKHNLPTaxonomyPrompts(AutoPrompt): + """Builds the 7 taxonomy prompts used during fine-tuning / inference.""" + def __init__(self) -> None: + super().__init__(prompt_template="{parent} is the superclass of {child}. This statement is [MASK].") + self.templates: List[str] = [ + "{parent} is the superclass of {child}. This statement is [MASK].", + "{child} is a subclass of {parent}. This statement is [MASK].", + "{parent} is the parent class of {child}. This statement is [MASK].", + "{child} is a child class of {parent}. This statement is [MASK].", + "{parent} is a supertype of {child}. This statement is [MASK].", + "{child} is a subtype of {parent}. This statement is [MASK].", + "{parent} is an ancestor class of {child}. This statement is [MASK].", + ] + + def make(self, parent: str, child: str, template_idx: int) -> str: + return self.templates[template_idx].format(parent=parent, child=child) + + +class SKHNLPSequentialFTLearner(AutoLearner): + """ + BERT-based classifier for taxonomy discovery. + + With OntologyData: + * TRAIN: ontology-aware split; create balanced train/eval with negatives. + * PREDICT/TEST: notebook-style parent selection -> list[{'parent', 'child'}]. + + With DataFrame/list: + * TRAIN: taxonomy_split + negatives; build prompts and fine-tune. + * PREDICT/TEST: pairwise binary classification (returns label + score). + """ + + # Fixed constants defining data split size and the proportional mix of + # negative sample types (reversed vs. manipulated) for balancing. + _EVAL_FRACTION: float = 0.16 + _NEG_RATIO_REVERSED: float = 1/3 + _NEG_RATIO_MANIPULATED: float = 2/3 + + def __init__( + self, + # core + model_name: str = "bert-large-uncased", + n_prompts: int = 7, + random_state: int = 1403, + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + + # ---- expose TrainingArguments as individual user-defined args ---- + output_dir: str = "./results/", + num_train_epochs: int = 1, + per_device_train_batch_size: int = 4, + per_device_eval_batch_size: int = 4, + warmup_steps: int = 500, + weight_decay: float = 0.01, + logging_dir: str = "./logs/", + logging_steps: int = 50, + eval_strategy: str = "epoch", + save_strategy: str = "epoch", + load_best_model_at_end: bool = True, + ) -> None: + super().__init__() + self.model_name = model_name + self.n_prompts = n_prompts + self.random_state = random_state + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + self.tokenizer: Optional[BertTokenizer] = None + self.model: Optional[BertForSequenceClassification] = None + self.prompter = SKHNLPTaxonomyPrompts() + + # Candidate parents (unique parent list) for multi-class parent selection. 
+ self._candidate_parents: Optional[List[str]] = None + + # Keep last train/eval tables for inspection + self._last_train: Optional[pd.DataFrame] = None + self._last_eval: Optional[pd.DataFrame] = None + + random.seed(self.random_state) + + # Build TrainingArguments from the individual user-defined values + self.training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_train_epochs, + per_device_train_batch_size=per_device_train_batch_size, + per_device_eval_batch_size=per_device_eval_batch_size, + warmup_steps=warmup_steps, + weight_decay=weight_decay, + logging_dir=logging_dir, + logging_steps=logging_steps, + eval_strategy=eval_strategy, + save_strategy=save_strategy, + load_best_model_at_end=load_best_model_at_end, + ) + + def load(self, model_id: Optional[str] = None, **_: Any) -> None: + """Load tokenizer and model; move model to the requested device.""" + model_id = model_id or self.model_name + self.tokenizer = BertTokenizer.from_pretrained(model_id) + self.model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2) + self.model.config.problem_type = "single_label_classification" + + # place on device chosen by user (or auto) + target_device = self.device + if target_device not in {"cuda", "cpu"}: + target_device = "cuda" if torch.cuda.is_available() else "cpu" + self.model.to(target_device) + + def tasks_ground_truth_former(self, data: Any, task: str) -> Any: + if task != "taxonomy-discovery": + return super().tasks_ground_truth_former(data, task) + + if isinstance(data, pd.DataFrame): + if "label" in data.columns: + return [ + {"parent": p, "child": c, "label": bool(lbl)} + for p, c, lbl in zip(data["parent"], data["child"], data["label"]) + ] + return [{"parent": p, "child": c} for p, c in zip(data["parent"], data["child"])] + + if isinstance(data, list): + return data + + return super().tasks_ground_truth_former(data, task) + + def _make_negatives(self, positives_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Return (reversed_df, manipulated_df).""" + unique_parents = positives_df["parent"].unique().tolist() + + def as_reversed(df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + out[["parent", "child"]] = out[["child", "parent"]].values + out["label"] = False + return out + + def with_random_parent(df: pd.DataFrame) -> pd.DataFrame: + def pick_other_parent(p: str) -> str: + pool = [x for x in unique_parents if x != p] + return random.choice(pool) if pool else p + out = df.copy() + out["parent"] = out["parent"].apply(pick_other_parent) + out["label"] = False + return out + + return as_reversed(positives_df), with_random_parent(positives_df) + + def _balance_with_negatives( + self, + positives_df: pd.DataFrame, + reversed_df: pd.DataFrame, + manipulated_df: pd.DataFrame, + ) -> pd.DataFrame: + """Combine positives and negatives with the same ratios as before.""" + n_pos = len(positives_df) + n_rev = int(n_pos * self._NEG_RATIO_REVERSED) + n_man = int(n_pos * self._NEG_RATIO_MANIPULATED) + + combined = pd.concat( + [ + positives_df.sample(n_pos, random_state=self.random_state), + reversed_df.sample(n_rev, random_state=self.random_state), + manipulated_df.sample(n_man, random_state=self.random_state), + ], + ignore_index=True, + ) + combined = combined.drop_duplicates(subset=["parent", "child", "label"]).reset_index(drop=True) + return combined + + def _add_prompt_columns(self, df: pd.DataFrame) -> pd.DataFrame: + out = df.copy() + for i in range(self.n_prompts): + out[f"prompt_{i+1}"] = out.apply( + lambda r, k=i: 
self.prompter.make(r["parent"], r["child"], k), axis=1
+            )
+        return out
+
+    @staticmethod
+    def _df_from_relations(relations: List[TaxonomicRelation], label: bool = True) -> pd.DataFrame:
+        if not relations:
+            return pd.DataFrame(columns=["parent", "child", "label"])
+        return pd.DataFrame([{"parent": r.parent, "child": r.child, "label": label} for r in relations])
+
+    @staticmethod
+    def _relations_from_df(df: pd.DataFrame) -> List[TaxonomicRelation]:
+        return [TaxonomicRelation(parent=p, child=c) for p, c in zip(df["parent"], df["child"])]
+
+    def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]") -> str:
+        prompts_1based = [
+            f"{parent} is the superclass of {child}. This statement is {mask_token}.",
+            f"{child} is a subclass of {parent}. This statement is {mask_token}.",
+            f"{parent} is the parent class of {child}. This statement is {mask_token}.",
+            f"{child} is a child class of {parent}. This statement is {mask_token}.",
+            f"{parent} is a supertype of {child}. This statement is {mask_token}.",
+            f"{child} is a subtype of {parent}. This statement is {mask_token}.",
+            f"{parent} is an ancestor class of {child}. This statement is {mask_token}.",
+            f"{child} is a descendant class of {parent}. This statement is {mask_token}.",
+            f"\"{parent}\" is the superclass of \"{child}\". This statement is {mask_token}.",
+        ]
+        return prompts_1based[index_1_based - 1]
+
+    @torch.no_grad()
+    def _predict_prompt_true_false(self, sentence: str) -> bool:
+        enc = self.tokenizer(sentence, return_tensors="pt").to(self.model.device)
+        logits = self.model(**enc).logits
+        predicted_label = torch.argmax(logits, dim=1).item()
+        return predicted_label == 1
+
+    def _select_parent_via_prompts(self, child: str) -> str:
+        assert self._candidate_parents, "Candidate parents not initialized."
+        scores: dict[str, int] = {p: 0 for p in self._candidate_parents}
+
+        def prompt_indices_for_level(level: int) -> List[int]:
+            if level == 0:
+                return [1]
+            return [2 * level, 2 * level + 1]
+
+        def recurse(active_parents: List[str], level: int) -> str:
+            idxs = [i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts]
+            if idxs:
+                for parent in active_parents:
+                    votes = sum(
+                        1
+                        for idx in idxs
+                        if self._predict_prompt_true_false(
+                            self._build_masked_prompt(parent=parent, child=child, index_1_based=idx)
+                        )
+                    )
+                    scores[parent] += votes
+
+            max_score = max(scores[p] for p in active_parents)
+            tied = [p for p in active_parents if scores[p] == max_score]
+            if len(tied) == 1:
+                return tied[0]
+            if level < 4:
+                return recurse(tied, level + 1)
+            return random.choice(tied)
+
+        return recurse(list(scores.keys()), level=0)
+
+    def _taxonomy_discovery(self, data: Any, test: bool = False):
+        """
+        TRAIN:
+          - OntologyData -> ontology-aware split; negatives per split; balanced sets.
+          - DataFrame/list -> taxonomy_split for positives; negatives proportional.
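+          (Illustrative: after negative sampling, each balanced train/eval row holds
+          "parent", "child", a boolean "label", and prompt_1..prompt_7 columns rendered
+          from the SKHNLPTaxonomyPrompts templates.)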
+ TEST: + - OntologyData -> parent selection: [{'parent': predicted, 'child': child}] + - DataFrame/list -> binary pair classification with 'label' + 'score' + """ + is_ontology_object = isinstance(data, OntologyData) + + # Normalize input + if isinstance(data, pd.DataFrame): + pairs_df = data.copy() + elif isinstance(data, list): + pairs_df = pd.DataFrame(data) + else: + gt_pairs = super().tasks_ground_truth_former(data, "taxonomy-discovery") + pairs_df = pd.DataFrame(gt_pairs) + if "label" not in pairs_df.columns: + pairs_df["label"] = True + + # Maintain candidate parents across calls + if "parent" in pairs_df.columns: + parents_in_call = sorted(pd.unique(pairs_df["parent"]).tolist()) + if test: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + else: + self._candidate_parents = sorted(set(self._candidate_parents).union(parents_in_call)) + else: + if self._candidate_parents is None: + self._candidate_parents = parents_in_call + + if test: + if is_ontology_object and self._candidate_parents: + predictions: List[dict[str, str]] = [] + for _, row in pairs_df.iterrows(): + child_term = row["child"] + chosen_parent = self._select_parent_via_prompts(child_term) + predictions.append({"parent": chosen_parent, "child": child_term}) + return predictions + + # pairwise binary classification + prompts_df = self._add_prompt_columns(pairs_df.copy()) + true_probs_by_prompt: List[torch.Tensor] = [] + + for i in range(self.n_prompts): + col = f"prompt_{i+1}" + enc = self.tokenizer( + prompts_df[col].tolist(), + return_tensors="pt", + padding=True, + truncation=True, + ).to(self.model.device) + with torch.no_grad(): + logits = self.model(**enc).logits + true_probs_by_prompt.append(torch.softmax(logits, dim=1)[:, 1]) + + avg_true_prob = torch.stack(true_probs_by_prompt, dim=0).mean(0) + predicted_bool = (avg_true_prob >= 0.5).cpu().tolist() + + results: List[dict[str, Any]] = [] + for p, c, s, yhat in zip( + pairs_df["parent"], pairs_df["child"], avg_true_prob.tolist(), predicted_bool + ): + results.append({"parent": p, "child": c, "label": int(bool(yhat)), "score": float(s)}) + return results + + if isinstance(data, OntologyData): + train_onto, eval_onto = ontology_split( + data, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + ) + + train_pos_rel: List[TaxonomicRelation] = getattr(train_onto.type_taxonomies, "taxonomies", []) or [] + eval_pos_rel: List[TaxonomicRelation] = getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + + train_pos_df = self._df_from_relations(train_pos_rel, label=True) + eval_pos_df = self._df_from_relations(eval_pos_rel, label=True) + + tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) + ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) + + train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) + eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + + train_df = self._add_prompt_columns(train_df) + eval_df = self._add_prompt_columns(eval_df) + + else: + if "label" not in pairs_df.columns or pairs_df["label"].nunique() == 1: + positives_df = pairs_df[pairs_df.get("label", True)][["parent", "child"]].copy() + pos_rel = self._relations_from_df(positives_df) + + tr_rel, ev_rel = taxonomy_split( + pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + ) + train_pos_df = self._df_from_relations(tr_rel, label=True) + eval_pos_df = self._df_from_relations(ev_rel, label=True) + + tr_rev_df, tr_man_df = 
self._make_negatives(train_pos_df)
+                ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df)
+
+                train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df)
+                eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df)
+
+                train_df = self._add_prompt_columns(train_df)
+                eval_df = self._add_prompt_columns(eval_df)
+
+            else:
+                positives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy()
+                pos_rel = self._relations_from_df(positives_df)
+
+                tr_rel, ev_rel = taxonomy_split(
+                    pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False
+                )
+                train_pos_df = self._df_from_relations(tr_rel, label=True)
+                eval_pos_df = self._df_from_relations(ev_rel, label=True)
+
+                # Use the explicitly labeled negative pairs (label == False) as negatives
+                negatives_df = pairs_df[~pairs_df["label"]][["parent", "child"]].copy()
+                negatives_df = negatives_df.sample(frac=1.0, random_state=self.random_state).reset_index(drop=True)
+
+                n_eval_neg = max(1, int(len(negatives_df) * self._EVAL_FRACTION)) if len(negatives_df) > 0 else 0
+                eval_neg_df = negatives_df.iloc[:n_eval_neg].copy() if n_eval_neg > 0 else negatives_df.iloc[:0].copy()
+                train_neg_df = negatives_df.iloc[n_eval_neg:].copy()
+
+                train_neg_df["label"] = False
+                eval_neg_df["label"] = False
+
+                train_df = pd.concat([train_pos_df, train_neg_df], ignore_index=True)
+                eval_df = pd.concat([eval_pos_df, eval_neg_df], ignore_index=True)
+
+                train_df = self._add_prompt_columns(train_df)
+                eval_df = self._add_prompt_columns(eval_df)
+
+        # Ensure labels are int64
+        train_df["label"] = train_df["label"].astype("int64")
+        eval_df["label"] = eval_df["label"].astype("int64")
+
+        # Sequential fine-tuning across prompts
+        for i in range(self.n_prompts):
+            prompt_col = f"prompt_{i+1}"
+            train_ds = Dataset.from_pandas(train_df[[prompt_col, "label"]].reset_index(drop=True))
+            eval_ds = Dataset.from_pandas(eval_df[[prompt_col, "label"]].reset_index(drop=True))
+
+            train_ds = train_ds.rename_column("label", "labels")
+            eval_ds = eval_ds.rename_column("label", "labels")
+
+            def tokenize_batch(batch):
+                return self.tokenizer(batch[prompt_col], padding="max_length", truncation=True)
+
+            train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col])
+            eval_ds = eval_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col])
+
+            train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+            eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+
+            trainer = Trainer(
+                model=self.model,
+                args=self.training_args,
+                train_dataset=train_ds,
+                eval_dataset=eval_ds,
+            )
+            trainer.train()
+
+        self._last_train = train_df
+        self._last_eval = eval_df
+        return None
+
+
+class SKHNLPZSLearner(AutoLearner):
+    """
+    Zero-shot taxonomy learner using an instruction-tuned causal LLM.
+
+    Behavior
+    --------
+    - Builds a fixed classification prompt listing 9 GeoNames parent classes.
+    - For each input row (child term), generates a short completion and parses
+      the predicted class from a strict '#[ ... ]#' format.
+    - Optionally normalizes the raw prediction to one of the valid 9 labels via:
+        * "none" : keep the parsed text as-is
+        * "substring" : snap to a label if either is a substring of the other
+        * "levenshtein" : snap to the closest label by edit distance
+        * "auto" : substring, then Levenshtein if needed
+    - Saves raw and normalized predictions to CSV if `save_path` is provided.
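+      For example (illustrative), a parsed prediction of "mountain" snaps to
+      "mountain, hill, rock" under "substring"/"auto", while a near-miss such as
+      "stream, lak" is recovered as "stream, lake" under "levenshtein".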
+ + Inputs the learner accepts (via `_to_dataframe`): + - pandas.DataFrame with columns: ['child', 'parent'] or ['child', 'parent', 'label'] + - list[dict] with keys: 'child', 'parent' (and optionally 'label') + - list of tuples/lists: (child, parent) or (child, parent, label) + - OntoLearner-style object exposing .type_taxonomies.taxonomies iterable with (child, parent) + """ + + # Fixed class inventory (GeoNames parents) + CLASS_LIST = [ + "city, village", + "country, state, region", + "forest, heath", + "mountain, hill, rock", + "parks, area", + "road, railroad", + "spot, building, farm", + "stream, lake", + "undersea", + ] + + # Strict format: #[ ... ]# + _PREDICTION_PATTERN = re.compile(r"#\[\s*([^\]]+?)\s*\]#") + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + max_new_tokens: int = 16, + save_path: Optional[str] = None, # directory or full path + verbose: bool = True, + normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" + random_state: int = 1403, + ) -> None: + super().__init__() + self.model_name = model_name + self.verbose = verbose + self.max_new_tokens = max_new_tokens + self.save_path = save_path + self.normalize_mode = (normalize_mode or "none").lower().strip() + self.random_state = random_state + + random.seed(self.random_state) + + # Device: auto-detect CUDA if not specified + if device is None: + self._has_cuda = torch.cuda.is_available() + else: + self._has_cuda = (device == "cuda") + self._pipe_device = 0 if self._has_cuda else -1 + self._model_device_map = {"": "cuda"} if self._has_cuda else None + + self._tokenizer = None + self._model = None + self._pipeline = None + + # Prompt template used for every example + self._classification_prompt = ( + "My task is classification. My classes are as follows: " + "(city, village), (country, state, region), (forest, heath), " + "(mountain, hill, rock), (parks, area), (road, railroad), " + "(spot, building, farm), (stream, lake), (undersea). " + 'I will provide you with a phrase like "wadi mouth". ' + "The name of each class is placed within a pair of parentheses. " + "I want you to choose the most appropriate class from those mentioned above " + "based on the given phrase and present it in a format like #[parks, area]#. " + "So, the general format for each response will be #[class name]#. " + "Pay attention to the format of the response. Start with a '#' character, " + "include the class name inside it, and end with another '#' character. " + "Additionally, make sure to include a '#' character at the end to indicate " + "that the answer is complete. I don't need any additional explanations." + ) + + def load(self, model_id: str = "") -> None: + """ + Load tokenizer, model, and text-generation pipeline. 
+ """ + model_id = model_id or self.model_name + if self.verbose: + print(f"[ZeroShotTaxonomyLearner] Loading {model_id}") + + self._tokenizer = AutoTokenizer.from_pretrained(model_id) + + # Ensure a pad token is set for generation + if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None: + self._tokenizer.pad_token = self._tokenizer.eos_token + + self._model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=self._model_device_map, + torch_dtype="auto", + ) + + self._pipeline = pipeline( + task="text-generation", + model=self._model, + tokenizer=self._tokenizer, + device=self._pipe_device, # 0 for GPU, -1 for CPU + ) + + if self.verbose: + print("Device set to use", "cuda" if self._has_cuda else "cpu") + print("[ZeroShotTaxonomyLearner] Model loaded.") + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Dict[str, str]]]: + """ + Zero-shot prediction over all incoming rows (no filtering/augmentation). + Returns a list of dictionaries: [{'parent': predicted_label, 'child': child}, ...] + """ + if not test: + if self.verbose: + print("[ZeroShot] Training skipped (zero-shot).") + return None + + df = self._to_dataframe(data) + + if self.verbose: + print(f"[ZeroShot] Incoming rows: {len(df)}; columns: {list(df.columns)}") + + eval_df = pd.DataFrame(df).reset_index(drop=True) + if eval_df.empty: + return [] + + # Prepare columns for inspection and saving + eval_df["prediction_raw"] = "" + eval_df["prediction_sub"] = "" + eval_df["prediction_lvn"] = "" + eval_df["prediction_auto"] = "" + eval_df["prediction"] = "" # final (per normalize_mode) + + # Generate predictions row by row + for idx, row in eval_df.iterrows(): + child_term = str(row["child"]) + raw_text, parsed_raw = self._generate_and_parse(child_term) + + # Choose a string to normalize (parsed token if matched, otherwise whole output) + basis = parsed_raw if parsed_raw != "unknown" else raw_text + + # Compute all normalization variants + sub_norm = self._normalize_substring_only(basis) + lvn_norm = self._normalize_levenshtein_only(basis) + auto_norm = self._normalize_auto(basis) + + # Final selection by mode + if self.normalize_mode == "none": + final_label = parsed_raw + elif self.normalize_mode == "substring": + final_label = sub_norm + elif self.normalize_mode == "levenshtein": + final_label = lvn_norm + elif self.normalize_mode == "auto": + final_label = auto_norm + else: + final_label = parsed_raw # fallback + + # Persist to DataFrame for inspection/export + eval_df.at[idx, "prediction_raw"] = parsed_raw + eval_df.at[idx, "prediction_sub"] = sub_norm + eval_df.at[idx, "prediction_lvn"] = lvn_norm + eval_df.at[idx, "prediction_auto"] = auto_norm + eval_df.at[idx, "prediction"] = final_label + + # Return in the format expected by the pipeline + return [{"parent": p, "child": c} for p, c in zip(eval_df["prediction"], eval_df["child"])] + + def _generate_and_parse(self, child_term: str) -> (str, str): + """ + Generate a completion for the given child term and extract the raw predicted class + using the strict '#[ ... ]#' pattern. 
+ + Returns + ------- + (raw_generation_text, parsed_prediction_or_unknown) + """ + messages = [ + {"role": "system", "content": "You are a helpful classifier."}, + {"role": "user", "content": f"{self._classification_prompt} {child_term}"}, + ] + + prompt = self._tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + generation = self._pipeline( + prompt, + max_new_tokens=self.max_new_tokens, + do_sample=False, + temperature=0.0, + top_p=1.0, + eos_token_id=self._tokenizer.eos_token_id, + pad_token_id=self._tokenizer.eos_token_id, + return_full_text=False, + )[0]["generated_text"] + + match = self._PREDICTION_PATTERN.search(generation) + parsed = match.group(1).strip() if match else "unknown" + return generation, parsed + + # ------------------------------------------------------------------------- + # Normalization helpers + # ------------------------------------------------------------------------- + + def _normalize_substring_only(self, text: str) -> str: + """ + Snap to a label if the string is equal to / contained in / contains a valid label (case-insensitive). + """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + for label in self.CLASS_LIST: + label_lower = label.lower() + if lowered == label_lower or lowered in label_lower or label_lower in lowered: + return label + return "unknown" + + def _normalize_levenshtein_only(self, text: str) -> str: + """ + Snap to the nearest label by Levenshtein (edit) distance. + """ + if not isinstance(text, str): + return "unknown" + lowered = text.strip().lower() + if not lowered: + return "unknown" + + best_label = None + best_distance = 10**9 + for label in self.CLASS_LIST: + label_lower = label.lower() + distance = Levenshtein.distance(lowered, label_lower) + if distance < best_distance: + best_distance = distance + best_label = label + return best_label or "unknown" + + def _normalize_auto(self, text: str) -> str: + """ + Cascade: try substring-first; if no match, fall back to Levenshtein snapping. + """ + snapped = self._normalize_substring_only(text) + return snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + + def _to_dataframe(data: Any) -> pd.DataFrame: + """ + Normalize various input formats into a DataFrame with columns: + ['child', 'parent'] or ['child', 'parent', 'label']. 
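+        For example, [("wadi", "stream, lake")] and
+        [{"child": "wadi", "parent": "stream, lake"}] both normalize to a
+        single-row frame with 'child' and 'parent' columns.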
+ """ + # Already a DataFrame + if isinstance(data, pd.DataFrame): + df = data.copy() + df.columns = [str(c).lower() for c in df.columns] + return df.reset_index(drop=True) + + # List[dict] + if isinstance(data, list) and data and isinstance(data[0], dict): + rows = [{str(k).lower(): v for k, v in d.items()} for d in data] + return pd.DataFrame(rows).reset_index(drop=True) + + # Iterable of tuples/lists: (child, parent[, label]) + if isinstance(data, (list, tuple)) and data: + first = data[0] + if isinstance(first, (list, tuple)) and not isinstance(first, dict): + n = len(first) + if n >= 3: + return pd.DataFrame(data, columns=["child", "parent", "label"]).reset_index(drop=True) + if n == 2: + return pd.DataFrame(data, columns=["child", "parent"]).reset_index(drop=True) + + # OntoLearner-style object (with .type_taxonomies.taxonomies) + try: + type_taxonomies = getattr(data, "type_taxonomies", None) + if type_taxonomies is not None: + taxonomies = getattr(type_taxonomies, "taxonomies", None) + if taxonomies is not None: + rows = [] + for rel in taxonomies: + parent = getattr(rel, "parent", None) + child = getattr(rel, "child", None) + label = getattr(rel, "label", None) if hasattr(rel, "label") else None + if parent is not None and child is not None: + rows.append({"child": child, "parent": parent, "label": label}) + if rows: + return pd.DataFrame(rows).reset_index(drop=True) + except Exception: + pass + + raise ValueError( + "Unsupported data format. Provide a DataFrame, a list of dicts, " + "a list of (child, parent[, label]) tuples/lists, or an object with " + ".type_taxonomies.taxonomies." + ) + + def _resolve_save_path(save_path: str, default_filename: str) -> str: + """ + If `save_path` is a directory, join it with `default_filename`. + If it's a file path, return as-is. 
+ """ + base = os.path.basename(save_path) + has_ext = os.path.splitext(base)[1] != "" + return save_path if has_ext else os.path.join(save_path, default_filename) diff --git a/requirements.txt b/requirements.txt index 6d71bd5..28a92bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ mistral-common[sentencepiece]~=1.8.5 g4f protobuf<5 accelerate>=0.26.0 +Levenshtein From 844de4f0a0b6a2aa1240941fe6283fca1f0c52ed Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Wed, 29 Oct 2025 16:07:24 +0100 Subject: [PATCH 3/7] adding sbunlp learner models --- ...lm_learner_sbunlp_fs_taxonomy_discovery.py | 66 +++ examples/llm_learner_sbunlp_text2onto.py | 81 +++ examples/llm_learner_sbunlp_zs_term_typing.py | 55 ++ ontolearner/__init__.py | 10 +- ontolearner/learner/__init__.py | 3 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/sbunlp.py | 317 +++++++++++ ontolearner/learner/term_typing/__init__.py | 1 + ontolearner/learner/term_typing/sbunlp.py | 400 +++++++++++++ ontolearner/learner/text2onto/__init__.py | 15 + ontolearner/learner/text2onto/sbunlp.py | 525 ++++++++++++++++++ 11 files changed, 1472 insertions(+), 2 deletions(-) create mode 100644 examples/llm_learner_sbunlp_fs_taxonomy_discovery.py create mode 100644 examples/llm_learner_sbunlp_text2onto.py create mode 100644 examples/llm_learner_sbunlp_zs_term_typing.py create mode 100644 ontolearner/learner/taxonomy_discovery/sbunlp.py create mode 100644 ontolearner/learner/term_typing/sbunlp.py create mode 100644 ontolearner/learner/text2onto/__init__.py create mode 100644 ontolearner/learner/text2onto/sbunlp.py diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py new file mode 100644 index 0000000..19797a9 --- /dev/null +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -0,0 +1,66 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +# Import the specific Few-Shot Learner implementation +from ontolearner import SBUNLPFewShotLearner + +# Load ontology and split +# Load the GeoNames ontology for taxonomy discovery. +# GeoNames provides geographic parent-child relationships (is-a hierarchy). +ontology = GeoNames() +ontology.load() +data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object + +# Split the taxonomic relationships into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing (terms to find relations for) + random_state=42, +) + +# Configure the learner with user-defined inference args + device +# Configure the SBUNLP Few-Shot Learner using the Qwen model. +# This performs in-context learning via N x M batch prompting. 
+llm_learner = SBUNLPFewShotLearner(
+    # Model / decoding
+    model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load
+    try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency
+    max_new_tokens=140, # limit the length of the model's response (for JSON output)
+    max_input_tokens=1500, # limit the total prompt length (context window)
+    temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON)
+    top_p=1.0, # top-p sampling disabled with temperature=0.0
+
+    # Grid settings (N x M prompts)
+    num_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks
+    num_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts)
+
+    # Run controls
+    limit_num_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run
+    output_dir="./outputs/taskC_batches", # Optional: dump per-prompt JSON results for debugging
+)
+
+# Build pipeline and run
+# Build the pipeline, passing the Few-Shot Learner.
+pipe = LearnerPipeline(
+    llm=llm_learner,
+    llm_id=llm_learner.model_name,
+    ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers
+    device="auto", # automatically select CUDA or CPU
+)
+
+# Run the full learning pipeline on the taxonomy-discovery task
+outputs = pipe(
+    train_data=train_data,
+    test_data=test_data,
+    task="taxonomy-discovery",
+    evaluate=True,
+    ontologizer_data=True,
+)
+
+# Display the evaluation results
+print("Metrics:", outputs.get("metrics"))
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs["elapsed_time"])
+
+# Print all returned outputs (include predictions)
+print(outputs)
diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py
new file mode 100644
index 0000000..564f641
--- /dev/null
+++ b/examples/llm_learner_sbunlp_text2onto.py
@@ -0,0 +1,81 @@
+import os
+import torch
+# Import all the required classes
+from ontolearner import SBUNLPText2OntoLearner
+from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM
+
+# Local folder where the dataset is stored
+# This path is relative to the directory where the script is executed
+# (e.g., E:\OntoLearner\examples)
+LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"
+
+# Ensure the base directories exist
+# Creates the train and test subdirectories if they don't already exist.
+os.makedirs(os.path.join(LOCAL_DATA_DIR, 'train'), exist_ok=True)
+os.makedirs(os.path.join(LOCAL_DATA_DIR, 'test'), exist_ok=True)
+
+# Define local file paths (pointing to already saved files)
+# These files are used as input for the Fit and Predict phases.
+DOCS_ALL_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/documents.jsonl"
+TERMS2DOC_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/train/terms2docs.json"
+DOCS_TEST_PATH = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/text2onto_ecology_test_documents.jsonl"
+
+# Output files for predictions (saved directly under LOCAL_DATA_DIR/test)
+# These files will be created by the predict_terms/types methods.
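+# Each file holds one JSON prediction object per line (JSONL).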
+TERMS_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +TYPES_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" + +#Initialize and Load Learner --- +MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +# Determine the device for inference (GPU or CPU) +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# Instantiate the underlying LLM helper +# (LocalAutoLLM handles model loading and generation) +llm_model_helper = LocalAutoLLM(device=DEVICE) + +# Instantiate the main learner class, passing the LLM helper to its constructor +learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE) + +# Load the model (This calls llm_model_helper.load) +LOAD_IN_4BIT = torch.cuda.is_available() +learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# Build Few-Shot Exemplars (Fit Phase) +# The fit method uses the local data paths to build the in-context learning prompts. +learner.fit( + train_docs_jsonl=DOCS_ALL_PATH, + terms2doc_json=TERMS2DOC_PATH, + sample_size=28, + seed=123 # Seed for stratified random sampling stability +) + +MAX_NEW_TOKENS = 100 + +terms_written = learner.predict_terms( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TERMS_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS +) +print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.") + +# Type Extraction subtask +types_written = learner.predict_types( + docs_test_jsonl=DOCS_TEST_PATH, + out_jsonl=TYPES_PRED_OUT, + max_new_tokens=MAX_NEW_TOKENS +) +print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.") + +try: + # Evaluate Term Extraction using the custom F1 function and gold data + f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term") + print(f"Final Term Extraction F1: {f1_term:.4f}") + + # Evaluate Type Extraction + f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type") + print(f"Final Type Extraction F1: {f1_type:.4f}") + +except Exception as e: + # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) + print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.") diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py new file mode 100644 index 0000000..75d01da --- /dev/null +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -0,0 +1,55 @@ +# Import core modules from the OntoLearner library +from ontolearner import AgrO, train_test_split, LearnerPipeline +# Import the specific Zero-Shot Learner implementation for Term Typing +from ontolearner import SBUNLPZSLearner + +# Load ontology and split +# Load the AgrO ontology for type inventory and test data. +ontology = AgrO() +ontology.load() +data = ontology.extract() # Extract the full set of relationships/terms + +# Split the data into train (to learn type inventory) and test (terms to predict) +train_data, test_data = train_test_split( + data, + test_size=0.6, # 60% of data used for testing + random_state=42, +) + +# Configure the Qwen Zero-Shot learner (inference-only) +# This learner's 'fit' phase learns the vocabulary of allowed type labels. 
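+# During fit() it collects the unique type labels present in the training split; at
+# predict() time each test term is prompted against that fixed inventory, and any
+# generated labels outside the inventory are filtered out.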
+llm_learner = SBUNLPZSLearner( + # Model / decoding + model_id="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load + # device= is auto-detected + max_new_tokens=64, # Sufficient length for JSON list of types + temperature=0.0, # Ensures deterministic (greedy) output + # token= None, # Assuming public model access +) + +# Build pipeline and run +# Build the pipeline, passing the Zero-Shot Learner. +pipe = LearnerPipeline( + llm=llm_learner, + llm_id=llm_learner.model_id, + ontologizer_data=False, + device="cpu", # select CUDA or CPU +) + +# Run the full learning pipeline on the Term-Typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=False, +) + +# Display the evaluation results +print("Metrics:", outputs.get("metrics")) + +# Display total elapsed time for learning (type inventory) + prediction + evaluation +print("Elapsed time:", outputs.get("elapsed_time")) + +# Print all returned outputs (include predictions) +print(outputs) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index d9ba608..49b94c4 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -31,9 +31,12 @@ StandardizedPrompting, LabelMapper, RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner + RWTHDBISTermTypingLearner, SKHNLPZSLearner, - SKHNLPSequentialFTLearner) + SKHNLPSequentialFTLearner, + SBUNLPFewShotLearner, + SBUNLPZSLearner, + SBUNLPText2OntoLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -55,6 +58,9 @@ "RWTHDBISTermTypingLearner", "SKHNLPZSLearner", "SKHNLPSequentialFTLearner", + "SBUNLPFewShotLearner", + "SBUNLPZSLearner", + "SBUNLPText2OntoLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 3c56154..4f41586 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -20,3 +20,6 @@ from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner +from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner +from .term_typing.sbunlp import SBUNLPZSLearner +from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index 2c6b452..d52513b 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -14,3 +14,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner +from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/taxonomy_discovery/sbunlp.py b/ontolearner/learner/taxonomy_discovery/sbunlp.py new file mode 100644 index 0000000..9fc520d --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/sbunlp.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import json +import importlib.util +from typing import Any, Dict, List, Optional, Tuple + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from ...base import AutoLearner + +class SBUNLPFewShotLearner(AutoLearner): + """ + Taxonomy-discovery via N×M batch prompting with a small Qwen model. + + Lifecycle + --------- + fit(): + Cache + clean training parent–child pairs. + predict(): + Chunk (train pairs × test terms), prompt per chunk pair, parse, merge, + and deduplicate predicted relations. + """ + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", + try_4bit: bool = True, + num_train_chunks: int = 7, + num_test_chunks: int = 7, + max_new_tokens: int = 140, + max_input_tokens: int = 1500, + temperature: float = 0.0, + top_p: float = 1.0, + limit_num_prompts: Optional[int] = None, + output_dir: Optional[str] = None, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.model_name = model_name + self.try_4bit = try_4bit + + self.num_train_chunks = num_train_chunks + self.num_test_chunks = num_test_chunks + + self.max_new_tokens = max_new_tokens + self.max_input_tokens = max_input_tokens + self.temperature = temperature + self.top_p = top_p + self.limit_num_prompts = limit_num_prompts + + self.output_dir = output_dir + + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.train_pairs_clean: List[Dict[str, str]] = [] + + # ----------------------- small helpers ---------------------- + def _clean_pairs(pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + Normalize, drop empty or self-relations, and deduplicate by (parent, child). + """ + cleaned_pairs: List[Dict[str, str]] = [] + seen_parent_child: set[Tuple[str, str]] = set() + + for pair_record in pair_rows or []: + if not isinstance(pair_record, dict): + continue + + parent_label = str(pair_record.get("parent", "")).strip() + child_label = str(pair_record.get("child", "")).strip() + if not parent_label or not child_label: + continue + + normalized_key = (parent_label.lower(), child_label.lower()) + if normalized_key[0] == normalized_key[1]: # parent==child + continue + if normalized_key in seen_parent_child: + continue + + seen_parent_child.add(normalized_key) + cleaned_pairs.append({"parent": parent_label, "child": child_label}) + + return cleaned_pairs + + def _chunk_list(items: List[Any], num_chunks: int) -> List[List[Any]]: + """ + Split `items` into `num_chunks` near-equal parts. Some chunks may be empty. + """ + if num_chunks <= 0: + return [items] + total_items = len(items) + base_size, remainder = divmod(total_items, num_chunks) + + chunks: List[List[Any]] = [] + start_index = 0 + for chunk_index in range(num_chunks): + current_size = base_size + (1 if chunk_index < remainder else 0) + end_index = start_index + current_size + chunks.append(items[start_index:end_index]) + start_index = end_index + return chunks + + def _ensure_dir(self, maybe_path: Optional[str]) -> None: + if maybe_path: + os.makedirs(maybe_path, exist_ok=True) + + # ---------------------- model load/gen ---------------------- + def load(self, **_: Any) -> None: + """ + Load tokenizer/model; use 4-bit nf4 on CUDA if available + requested. 
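+        Falls back to plain fp16 (CUDA) or fp32 (CPU) weights without quantization
+        when bitsandbytes is unavailable or no GPU is present.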
+ """ + bnb_available = importlib.util.find_spec("bitsandbytes") is not None + use_4bit_quant = bool(self.try_4bit and bnb_available and self.device == "cuda") + + quant_config = None + if use_4bit_quant: + from transformers import BitsAndBytesConfig + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map=("auto" if self.device == "cuda" else None), + torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32), + quantization_config=quant_config, + ) + + def _format_chat(self, user_text: str) -> str: + """ + Wrap user text with the model's chat template (if present). + """ + if hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None): + return self.tokenizer.apply_chat_template( + [{"role": "user", "content": user_text}], + tokenize=False, + add_generation_prompt=True, + ) + return user_text + + @torch.no_grad() + def _generate(self, prompt_text: str) -> str: + """ + Single prompt → model text. Clips *input* tokens to avoid overflow. + """ + formatted_prompt = self._format_chat(prompt_text) + prompt_token_ids = self.tokenizer(formatted_prompt, add_special_tokens=False, return_tensors=None)["input_ids"] + if len(prompt_token_ids) > self.max_input_tokens: + prompt_token_ids = prompt_token_ids[-self.max_input_tokens:] + + prompt_tensor = torch.tensor([prompt_token_ids]).to(self.model.device) + + generation = self.model.generate( + input_ids=prompt_tensor, + max_new_tokens=self.max_new_tokens, + do_sample=(self.temperature > 0.0), + temperature=self.temperature, + top_p=self.top_p, + pad_token_id=self.tokenizer.pad_token_id, + eos_token_id=getattr(self.tokenizer, "eos_token_id", None), + use_cache=True, + ) + + decoded_full = self.tokenizer.decode(generation[0], skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(prompt_tensor[0], skip_special_tokens=True) + return decoded_full[len(decoded_prompt):].strip() if decoded_full.startswith(decoded_prompt) else decoded_full.strip() + + # ------------------ prompt build & parsing ------------------ + def _build_prompt(train_pairs_chunk: List[Dict[str, str]], + test_terms_chunk: List[str]) -> str: + """ + Few-shot with JSON examples + a block of test terms. + The model must return ONLY a JSON array of {parent, child}. + """ + examples_json = json.dumps(train_pairs_chunk, ensure_ascii=False, indent=2) + test_types_block = "\n".join(test_terms_chunk) + return ( + "From this file, extract all parent–child relations like in the examples.\n" + "Return ONLY a JSON array of objects with keys 'parent' and 'child'.\n" + "Output format:\n" + "[\n" + ' {"parent": "parent1", "child": "child1"},\n' + ' {"parent": "parent2", "child": "child2"}\n' + "]\n\n" + "EXAMPLES (JSON):\n" + f"{examples_json}\n\n" + "TEST TYPES (between [PAIR] tags):\n" + "[PAIR]\n" + f"{test_types_block}\n" + "[PAIR]\n" + "Return only JSON." + ) + + def _parse_pairs(model_text: str) -> List[Dict[str, str]]: + """ + Parse a model response into a list of {'parent','child'} dicts. 
+ """ + def deduplicate_and_normalize(dict_list: List[Dict[str, str]]) -> List[Dict[str, str]]: + return SBUNLPFewShotLearner._clean_pairs(dict_list) + + response_text = model_text.strip() + + # 1) Direct JSON list + try: + maybe_json = json.loads(response_text) + if isinstance(maybe_json, list): + return deduplicate_and_normalize(maybe_json) + except Exception: + pass + + # 2) Find outermost [ ... ] and parse that + outer_list_match = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", response_text) + if outer_list_match: + try: + array_json = json.loads(outer_list_match.group(0)) + if isinstance(array_json, list): + return deduplicate_and_normalize(array_json) + except Exception: + pass + + # 3) Nothing parsable + return [] + + # --------------------- AutoLearner hooks -------------------- + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Build the training example bank (parent–child pairs). + """ + if task != "taxonomy-discovery": + return super().fit(train_data, task, ontologizer) + + if ontologizer: + # Convert ontology object → list of {"parent","child"} gold pairs + gold_pairs_from_ontology = self.tasks_ground_truth_former( + train_data, task="taxonomy-discovery" + ) + self.train_pairs_clean = self._clean_pairs(gold_pairs_from_ontology) + else: + # Already a Python list of dicts + self.train_pairs_clean = self._clean_pairs(train_data) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Main prediction path. Returns a deduplicated list of relations. + """ + if not test: + return None + + if self.model is None or self.tokenizer is None: + self.load() + + # Build test vocabulary of types/terms + if isinstance(data, list) and (len(data) == 0 or isinstance(data[0], str)): + test_type_list: List[str] = data + else: + test_type_list = super().tasks_data_former( + data=data, task="taxonomy-discovery", test=True + ) + + # Create N×M grid + train_chunks = self._chunk_list(self.train_pairs_clean, self.num_train_chunks) + test_chunks = self._chunk_list(test_type_list, self.num_test_chunks) + + self._ensure_dir(self.output_dir) + + merged_predicted_pairs: List[Dict[str, str]] = [] + issued_prompt_count = 0 + + for train_chunk_index, train_pairs_chunk in enumerate(train_chunks, start=1): + for test_chunk_index, test_terms_chunk in enumerate(test_chunks, start=1): + issued_prompt_count += 1 + if self.limit_num_prompts and issued_prompt_count > self.limit_num_prompts: + break + + prompt_text = self._build_prompt(train_pairs_chunk, test_terms_chunk) + model_response = self._generate(prompt_text) + parsed_relation_pairs = self._parse_pairs(model_response) + + # Optional per-batch dump for debugging + if self.output_dir: + batch_json_path = os.path.join( + self.output_dir, f"pairs_T{train_chunk_index}_S{test_chunk_index}.json" + ) + with open(batch_json_path, "w", encoding="utf-8") as fp: + json.dump(parsed_relation_pairs, fp, ensure_ascii=False, indent=2) + + merged_predicted_pairs.extend(parsed_relation_pairs) + + if self.limit_num_prompts and issued_prompt_count >= (self.limit_num_prompts or 0): + break + + # Deduplicate final list + return self._clean_pairs(merged_predicted_pairs) diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py index ab5b4f8..ebd8cd9 100644 --- a/ontolearner/learner/term_typing/__init__.py +++ b/ontolearner/learner/term_typing/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .rwthdbis import RWTHDBISSFTLearner +from .sbunlp import SBUNLPZSLearner diff --git a/ontolearner/learner/term_typing/sbunlp.py b/ontolearner/learner/term_typing/sbunlp.py new file mode 100644 index 0000000..f838bd0 --- /dev/null +++ b/ontolearner/learner/term_typing/sbunlp.py @@ -0,0 +1,400 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional +import re + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ...base import AutoLearner + +class SBUNLPZSLearner(AutoLearner): + """ + Qwen-based blind term typing learner (Task B), implemented as an AutoLearner. + + This class reproduces the notebook logic: + - Fit phase learns the *allowed type inventory* from training data. + - Predict phase performs blind prompting per term using the learned type list. + - Outputs are restricted to the allowed types and returned as [{"id", "types"}]. + + Expected I/O (recommended): + - fit(train_data, task="term-typing", ontologizer=True): + The framework's AutoLearner.tasks_data_former() provides a unique list of + type labels; we store it to `self.allowed_types`. + - predict(eval_data, task="term-typing", ontologizer=False): + Pass a list of dicts with keys {"id": str, "term": str} so IDs are preserved. + Returns a list of dicts [{"id": ..., "types": [...] }]. + """ + + def __init__( + self, + model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + device: Optional[str] = None, + max_new_tokens: int = 64, + temperature: float = 0.0, + token: Optional[str] = None, + ) -> None: + """ + Args: + model_id: HF model id for Qwen. + device: "cuda", "mps", or "cpu". Auto-detected if None. + max_new_tokens: Generation cap per prompt. + temperature: Not used for greedy decoding (kept for future). + token: HF token if the model is gated. + """ + super().__init__() + + # Basic configuration + self.model_id = model_id + # default device detection: prefer CUDA if available + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.max_new_tokens = max_new_tokens + self.temperature = temperature + self.token = token + + # Model/tokenizer placeholders (populated by load()) + self.tokenizer: Optional[AutoTokenizer] = None + self.model: Optional[AutoModelForCausalLM] = None + + # Learned inventory of allowed type labels (populated by fit()) + self.allowed_types: List[str] = [] + + # Regex used to extract quoted strings from model output (e.g. "type") + self._quoted_re = re.compile(r'"([^"]+)"') + + def load(self, **kwargs: Any): + """ + Load Qwen model and tokenizer. + + NOTE: + - The HF arguments used here mirror your original code (`token=...`). + You may see a deprecation warning for `torch_dtype` (older transformers); + switching to `dtype=` is recommended but I did not change behavior here. + """ + # Respect overrides from kwargs if provided + model_id = kwargs.get("model_id", self.model_id) + token = kwargs.get("token", self.token) + + # Load tokenizer. If the model is gated, pass token (original code uses `token`). 
+ # If your environment requires `use_auth_token=` replace here. + self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) + + # Ensure tokenizer has a pad token (some models omit it) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Device mapping for from_pretrained -> keep same behavior as original code + device_map = "auto" if self.device != "cpu" else "cpu" + # original code used torch_dtype; left as-is to avoid behavioral change + torch_dtype = torch.float16 if self.device != "cpu" else torch.float32 + + # Load the model weights. This can be heavy; keep same params as original. + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch_dtype, + token=token, + ) + return self + + # ------------------------------------------------------------------------- + # Fit / Predict interface + # ------------------------------------------------------------------------- + def fit(self, train_data: Any, task: str, ontologizer: bool = True): + """ + Learn the allowed type inventory from the training data. + + Expected behavior: + - If `tasks_data_former(..., test=False)` returns a list of strings, + set allowed_types to that list (deduped & sorted). + - If it returns a list of dicts (relationships), extract unique 'parent' + fields and use those as the allowed type inventory. + + This method contains a tolerant branch for the framework's custom container: + If the returned `train_fmt` is not a list but has a `.term_typings` attribute + (e.g., OntologyData object used by the framework), iterate that attribute + and collect any `types` values found. + """ + train_fmt = self.tasks_data_former(data=train_data, task=task, test=False) if ontologizer else train_data + if task != "term-typing": + raise ValueError("SBUNLPZSLearner only implements 'term-typing'.") + + # If framework passed a container with `.term_typings`, extract types from there + if not isinstance(train_fmt, list): + # handle OntologyData-like object with attribute 'term_typings' + if hasattr(train_fmt, "term_typings"): + try: + # term_typings is expected to be an iterable of objects with attribute `types` + collected = set() + for tt in getattr(train_fmt, "term_typings") or []: + # tt.types could be list[str] or a single str + if hasattr(tt, "types"): + tvals = tt.types + elif isinstance(tt, dict) and "types" in tt: + tvals = tt["types"] + else: + tvals = None + + # Normalize both list and single-string cases + if isinstance(tvals, (list, tuple, set)): + for x in tvals: + if isinstance(x, str): + collected.add(x) + elif isinstance(tvals, str): + collected.add(tvals) + + # If we successfully collected types, set allowed_types and return + if collected: + self.allowed_types = sorted(collected) + return self + # else fall through to error below (no types found) + except Exception: + # If anything unexpected occurs while iterating term_typings, + # gracefully fall through and raise the original TypeError below. + pass + + # not a supported non-list type -> keep original behavior (raise) + raise TypeError("For term-typing, expected a list of type labels at fit().") + + # At this point train_fmt is a list (original logic preserved) + if train_fmt and isinstance(train_fmt[0], dict) and "parent" in train_fmt[0]: + # Case A: Received raw relationships/pairs (e.g., from train_test_split). + # Extract unique parent types from the relationship records. 
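+            # e.g. relationship rows such as {"parent": "measurement unit", "child": "kilogram"}
+            # would contribute "measurement unit" to the inventory (values here are illustrative).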
+ unique_types = set(r.get("parent") for r in train_fmt if r.get("parent")) + self.allowed_types = sorted(unique_types) + elif all(isinstance(x, str) for x in train_fmt): + # Case B: Received a clean list of type labels (List[str]). + self.allowed_types = sorted(set(train_fmt)) + else: + # The input is a list but not in either expected format -> raise + raise TypeError("For term-typing, input data format for fit() is invalid. Expected list of strings (types) or list of relationships (dicts).") + + return self + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """ + Predict types for each term. + + Expected inputs: + - With ontologizer=True: a list[str] of term strings (IDs are autogenerated). + - With ontologizer=False: a list[dict] where each dict has keys {'id','term'}. + + This method tolerantly converts common framework containers (e.g., an + OntologyData object exposing `.term_typings`) into the expected list[dict] + shape so that the internal _term_typing() can run unchanged. + """ + if task != "term-typing": + # Delegate to base for other tasks (not implemented here) + return super().predict(eval_data, task, ontologizer=ontologizer) + + def _extract_list_of_dicts_from_term_typings(obj) -> Optional[List[Dict[str, str]]]: + """ + Helper: try to produce a list of {"id","term"} dicts from objects + exposing a `term_typings` iterable. Supports either object-like + TermTyping (attributes) or dict-style entries. + """ + tts = getattr(obj, "term_typings", None) + if tts is None: + return None + out = [] + for tt in tts: + # support object-style TermTyping (attributes) and dict-style + if isinstance(tt, dict): + # try several common key names for ID + tid = tt.get("ID") or tt.get("id") or tt.get("Id") or tt.get("ID_") + tterm = tt.get("term") or tt.get("label") or tt.get("name") + else: + # object-style access + tid = getattr(tt, "ID", None) or getattr(tt, "id", None) or getattr(tt, "Id", None) + tterm = getattr(tt, "term", None) or getattr(tt, "label", None) or getattr(tt, "name", None) + if tid is None or tterm is None: + # skip malformed entry - this is defensive so downstream code has valid inputs + continue + out.append({"id": str(tid), "term": str(tterm)}) + return out if out else None + + # Case A: ontologizer=True -> framework often provides list[str] + if ontologizer: + if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): + # Simple case: convert list of terms to list of dicts with generated IDs + eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data)] + else: + # Try to extract from a framework container (e.g., OntologyData) + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + # Last resort: if eval_data is some iterable of strings, convert it + try: + if hasattr(eval_data, "__iter__") and not isinstance(eval_data, (str, bytes)): + lst = list(eval_data) + if all(isinstance(x, str) for x in lst): + eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(lst)] + else: + raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + else: + raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + except TypeError: + # re-raise to preserve original error semantics + raise + # Delegate to internal inference routine + return self._term_typing(eval_pack, test=True) + + # Case B: ontologizer=False -> we expect list[dict], but tolerate common containers + else: + if isinstance(eval_data, 
list) and all(isinstance(x, dict) for x in eval_data): + eval_pack = eval_data + else: + # Try to extract from framework container (term_typings) + maybe = _extract_list_of_dicts_from_term_typings(eval_data) + if maybe is not None: + eval_pack = maybe + else: + # As a final attempt, allow eval_data to be a dict with a list under some known keys + if isinstance(eval_data, dict): + for key in ("term_typings", "terms", "items"): + if key in eval_data and isinstance(eval_data[key], (list, tuple)): + converted = [] + for x in eval_data[key]: + # Accept dict-style entries that include id and term/name + if isinstance(x, dict) and ("id" in x or "ID" in x) and ("term" in x or "name" in x): + tid = x.get("ID") or x.get("id") + tterm = x.get("term") or x.get("name") + converted.append({"id": str(tid), "term": str(tterm)}) + if converted: + eval_pack = converted + break + else: + # Could not convert; raise same TypeError as before + raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + else: + # Not a supported container -> raise + raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + # Delegate to internal inference routine + return self._term_typing(eval_pack, test=True) + + + # ------------------------------------------------------------------------- + # Internal task implementations (AutoLearner hooks) + # ------------------------------------------------------------------------- + def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: + """ + Core implementation: + - training mode (test=False): `data` is a list of allowed type labels -> store them. + - inference mode (test=True): `data` is a list of {"id","term"} -> produce [{"id","types"}]. + """ + if not test: + # training: expect a list of strings (type labels) + if not isinstance(data, list): + raise TypeError("Expected a list of type labels at training time.") + self.allowed_types = sorted(set(data)) + return None + + # Inference path + if not isinstance(data, list) or not all(isinstance(x, dict) for x in data): + raise TypeError("At prediction time, expected a list of {'id','term'} dicts.") + + # Ensure model and tokenizer are loaded + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. Call .load() before predict().") + + results = [] + for item in data: + # preserve incoming IDs and terms + term_id = item["id"] + term_text = item["term"] + + # build the blind JSON-prompt that instructs the model to output types + prompt = self._build_blind_prompt(term_id, term_text, self.allowed_types) + + # generate and parse model output into allowed types + types = self._generate_and_parse_types(prompt) + + # append result for this term (keep original id) + # include the original term so downstream evaluation (and any consumers) can match by term + results.append({"id": term_id, "term": term_text, "types": types}) + + return results + + # ------------------------------------------------------------------------- + # Prompting + parsing + # ------------------------------------------------------------------------- + + def _format_types_inline(allowed: List[str]) -> str: + """ + Format allowed types as comma-separated quoted strings for insertion into the prompt. + Example: '"type1", "type2", "type3"' + """ + return ", ".join(f'"{t}"' for t in allowed) + + def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) -> str: + """ + Construct the prompt given a single term. 
The prompt: + - Instructs the model to produce a JSON array of {id, types} objects. + - Provides the allowed types list (so the model should only use those). + - Includes the single input item for which the model must decide types. + + Note: This is the same blind-prompting approach used in the original notebook. + """ + allowed_str = self._format_types_inline(allowed_types) + return ( + "Identify the type(s) of the term in a second JSON file.\n" + "A term can have more than one type.\n" + "Output file must be in this format:\n" + "[\n" + '{ "id": "TT_465e8904", "types": [ "type1" ] },\n' + '{ "id": "TT_01c7707e", "types": [ "type2", "type3" ] },\n' + '{ "id": "TT_b20cb478", "types": [ "type4" ] }\n' + "]\n" + "The id must be taken from the input JSON file.\n" + "You must find the type(s) for each term in the JSON file.\n" + "Types must be selected only from the types list.\n\n" + f"Types list: {allowed_str}\n\n" + f'{{ "id": "{term_id}", "term": "{term}" }}' + ) + + def _generate_and_parse_types(self, prompt: str) -> List[str]: + """ + Greedy generate, then extract quoted strings and filter by allowed types. + + Important details: + - We assert model/tokenizer presence before calling. + - Tokenized inputs are moved to the model device (original code uses .to(self.model.device)). + - The decoded text is scanned for quoted substrings using self._quoted_re. + - Only quoted strings that are present in self.allowed_types are kept. + - Returned list is unique & sorted for deterministic ordering. + """ + assert self.model is not None and self.tokenizer is not None + + # Tokenize prompt and move tensors to model device to avoid device mismatch + inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device) + + with torch.no_grad(): + outputs = self.model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + do_sample=False, # deterministic (greedy) decoding + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Decode full generated sequence (prompt + generation). Then extract quoted strings. + text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) + candidates = self._quoted_re.findall(text) + + # Filter candidates to the allowed inventory + filtered = [c for c in candidates if c in self.allowed_types] + + # Return unique & sorted for stability across runs + return sorted(set(filtered)) diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py new file mode 100644 index 0000000..30e8372 --- /dev/null +++ b/ontolearner/learner/text2onto/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
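# Illustrative sketch: the `_generate_and_parse_types` step in rwthdbis.py above only needs a
# precompiled regex for double-quoted substrings plus the allowed-type inventory. A minimal
# standalone version of the same idea, assuming `self._quoted_re` is equivalent to
# re.compile(r'"([^"]+)"') (the compiled pattern is defined earlier in that file, outside this hunk):

import re
from typing import List

QUOTED_RE = re.compile(r'"([^"]+)"')  # assumed stand-in for self._quoted_re

def parse_types(generated_text: str, allowed_types: List[str]) -> List[str]:
    """Extract quoted strings from the decoded output and keep only allowed type labels."""
    candidates = QUOTED_RE.findall(generated_text)
    return sorted({c for c in candidates if c in allowed_types})

# Quoted non-type tokens such as "id" or "TT_..." are dropped by the allowed-list filter.
raw = '[ { "id": "TT_000001", "types": [ "Crop", "Practice" ] } ]'
print(parse_types(raw, ["Crop", "Practice", "Trait"]))  # -> ['Crop', 'Practice']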
+ +from .sbunlp import SBUNLPFewShotLearner diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py new file mode 100644 index 0000000..8ab617d --- /dev/null +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -0,0 +1,525 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#      https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import random +import re +import ast +import gc +from typing import Any, Dict, List, Optional, Set, Tuple +from collections import defaultdict + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + +from ...base import AutoLearner, AutoLLM + +# ----------------------------------------------------------------------------- +# Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface +# ----------------------------------------------------------------------------- +class LocalAutoLLM(AutoLLM): + """ + Handles loading and generation for a Hugging Face Causal Language Model (Qwen/TinyLlama). + Uses 4-bit quantization for efficiency and greedy decoding by default. + """ + + def __init__(self, label_mapper: Any = None, device: str = "cpu", token: str = "") -> None: + super().__init__(label_mapper=label_mapper, device=device, token=token) + self.model = None + self.tokenizer = None + + def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", trust_remote_code: bool = True): + """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" + + # Determine the target data type (default to float32 for CPU, float16 for GPU) + torch_dtype_val = (torch.float16 if torch.cuda.is_available() else torch.float32) + + # Load the tokenizer + self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + quant_config = None + if load_in_4bit: + # Configure BitsAndBytes for 4-bit loading + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + if torch_dtype_val is None: + torch_dtype_val = torch.float16 + + # Set device mapping (auto for multi-GPU or single GPU, explicit CPU otherwise) + device_map = "auto" if (self.device != "cpu") else {"": "cpu"} + + # Load the Causal Language Model + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch_dtype_val, + quantization_config=quant_config, + trust_remote_code=trust_remote_code, + ) + + # Ensure model is on the correct device (redundant if device_map="auto" but safe) + if self.device == "cpu": + self.model.to("cpu") + + def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: float = 0.0, top_p: float = 1.0) -> List[str]: + """Generate continuations for a list of prompts, returning only the generated part.""" + if self.model is None or self.tokenizer is None: + raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") + + # --- Generation Setup --- + # Tokenize batch (padding is essential for batch inference) + enc = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True) + input_ids = enc["input_ids"] + attention_mask = enc["attention_mask"] + + # Move tensors to the model's device (e.g., cuda:0) + model_device = next(self.model.parameters()).device + input_ids = input_ids.to(model_device) + attention_mask = attention_mask.to(model_device) + + # --- Generate --- + with torch.no_grad(): + outputs = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=max_new_tokens, + do_sample=(temperature > 0.0), # Use greedy decoding if temperature is 0.0 + temperature=temperature, + top_p=top_p, + pad_token_id=self.tokenizer.eos_token_id, + ) + + # --- Post-processing: Extract only the generated tail --- + decoded_outputs: List[str] = [] + for i, output_ids in enumerate(outputs): + full_decoded_text = self.tokenizer.decode(output_ids, skip_special_tokens=True) + prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) + + # Safely strip the prompt text from the full output + if full_decoded_text.startswith(prompt_text): + generated_tail = full_decoded_text[len(prompt_text):].strip() + else: + # Fallback extraction (less robust if padding affects token indices) + prompt_len = input_ids.shape[1] + generated_tail = self.tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=True).strip() + decoded_outputs.append(generated_tail) + + return decoded_outputs + +# ----------------------------------------------------------------------------- +# Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) +# ----------------------------------------------------------------------------- +class SBUNLPFewShotLearner(AutoLearner): + """ + Concrete learner implementing the Task A Text2Onto pipeline (Term and Type Extraction). + It uses Few-Shot prompts generated from training data for inference. + """ + + def __init__(self, model: Optional[AutoLLM] = None, device: str = "cpu"): + super().__init__() + # self.model is an instance of LocalAutoLLM + self.model = model or LocalAutoLLM(device=device) + self.device = device + # Cached in-memory prompt blocks built during the fit phase + self.fewshot_terms_block: str = "" + self.fewshot_types_block: str = "" + + # --- Few-shot construction (terms) --- + def build_stratified_fewshot_prompt( + self, + documents_path: str, + terms_path: str, + sample_size: int = 28, + seed: int = 123, + max_chars_per_text: int = 1200, + ) -> str: + """ + Builds the few-shot exemplar block for Term Extraction using stratified sampling. 
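# Illustrative usage sketch of the LocalAutoLLM wrapper defined above (prompt text is a placeholder;
# 4-bit loading only takes effect when bitsandbytes and a CUDA GPU are available):

import torch
from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM  # module added by this patch

device = "cuda" if torch.cuda.is_available() else "cpu"
llm = LocalAutoLLM(device=device)
llm.load("Qwen/Qwen2.5-0.5B-Instruct", load_in_4bit=(device == "cuda"))

# generate() returns only the continuation: the prompt text is stripped from each decoded output.
tails = llm.generate(
    ["Extract ontology terms from: 'Coral reefs host diverse fish species.'"],
    max_new_tokens=32,
    temperature=0.0,  # greedy decoding
)
print(tails[0])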
+ """ + random.seed(seed) + + # Read documents (JSONL) into a list + corpus_documents: List[Dict[str, Any]] = [] + with open(documents_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + if line.strip(): + corpus_documents.append(json.loads(line)) + + num_total_docs = len(corpus_documents) + num_sample_docs = min(sample_size, num_total_docs) + + # Load the map of term -> [list of document IDs] + with open(terms_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + # Invert map: document ID -> [list of terms] + doc_id_to_terms_map = defaultdict(list) + for term, doc_ids in term_to_doc_map.items(): + for doc_id in doc_ids: + doc_id_to_terms_map[doc_id].append(term) + + # Define strata (groups of documents associated with specific terms) + strata_map = defaultdict(list) + for doc in corpus_documents: + doc_id = doc.get("id", "") + associated_terms = doc_id_to_terms_map.get(doc_id, ["no_term"]) + for term in associated_terms: + strata_map[term].append(doc) + + # Perform proportional sampling across strata + sampled_documents: List[Dict[str, Any]] = [] + for term_str, stratum_docs in strata_map.items(): + num_stratum_docs = len(stratum_docs) + if num_stratum_docs == 0: + continue + + # Calculate proportional sample size + proportion = num_stratum_docs / num_total_docs + num_to_sample_from_stratum = int(num_sample_docs * proportion) + + if num_to_sample_from_stratum > 0: + sampled_documents.extend(random.sample(stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs))) + + # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' + unique_docs_by_id = {} + for doc in sampled_documents: + unique_docs_by_id[doc.get("id", "")] = doc + + final_sample_docs = list(unique_docs_by_id.values()) + + if len(final_sample_docs) > num_sample_docs: + final_sample_docs = random.sample(final_sample_docs, num_sample_docs) + elif len(final_sample_docs) < num_sample_docs: + remaining_docs = [d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id] + needed_count = min(num_sample_docs - len(final_sample_docs), len(remaining_docs)) + final_sample_docs.extend(random.sample(remaining_docs, needed_count)) + + # Format the few-shot exemplar text block + prompt_lines: List[str] = [] + for doc in final_sample_docs: + doc_id = doc.get("id", "") + title = doc.get("title", "") + text = doc.get("text", "") + + # Truncate text if it exceeds the maximum character limit + if max_chars_per_text and len(text) > max_chars_per_text: + text = text[:max_chars_per_text] + "…" + + associated_terms = doc_id_to_terms_map.get(doc_id, []) + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text}\nAssociated Terms: {associated_terms}\n----------------------------------------" + ) + + prompt_block = "\n".join(prompt_lines) + self.fewshot_terms_block = prompt_block + return prompt_block + + # --- Few-shot construction (types) --- + def build_types_fewshot_block( + self, + docs_jsonl: str, + terms2doc_json: str, + sample_per_term: int = 1, + full_word: bool = True, + case_sensitive: bool = True, + max_chars_per_text: int = 800, + ) -> str: + """ + Builds the few-shot block for Type Extraction. + This method samples documents based on finding an associated term/type within the text. 
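# The allocation above is plain proportional sampling: each stratum contributes roughly
# sample_size * |stratum| / |corpus| exemplars, after which the method deduplicates by id and
# tops up because int() rounding can leave the total short. A toy sketch of the allocation rule
# (simplified data shapes, not the real document format):

import random
from typing import Dict, List

def allocate_proportionally(strata: Dict[str, List[dict]], sample_size: int, seed: int = 123) -> List[dict]:
    """Pick ~sample_size docs, each stratum contributing in proportion to its size."""
    random.seed(seed)
    total = sum(len(docs) for docs in strata.values())
    picked: List[dict] = []
    for docs in strata.values():
        quota = int(sample_size * len(docs) / total)  # proportional share, rounded down
        if quota > 0:
            picked.extend(random.sample(docs, min(quota, len(docs))))
    return picked

strata = {"coral": [{"id": f"c{i}"} for i in range(8)], "reef": [{"id": f"r{i}"} for i in range(2)]}
print(len(allocate_proportionally(strata, sample_size=5)))  # -> 5 (4 from 'coral', 1 from 'reef')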
+ """ + # Load documents into dict by ID + docs_by_id = {} + with open(docs_jsonl, "r", encoding="utf-8") as file_handle: + for line in file_handle: + line_stripped = line.strip() + if line_stripped: + try: + doc = json.loads(line_stripped) + doc_id = doc.get("id", "") + if doc_id: + docs_by_id[doc_id] = doc + except json.JSONDecodeError: + continue + + # Load term -> [doc_id,...] map + with open(terms2doc_json, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + flags = 0 if case_sensitive else re.IGNORECASE + prompt_lines: List[str] = [] + + # Iterate over terms (which act as types in this context) + for term, doc_ids in term_to_doc_map.items(): + escaped_term = re.escape(term) + # Create regex pattern for matching the term in the text + pattern = rf"\b{escaped_term}\b" if full_word else escaped_term + term_regex = re.compile(pattern, flags=flags) + + picked_count = 0 + for doc_id in doc_ids: + doc = docs_by_id.get(doc_id) + if not doc: + continue + + title = doc.get("title", "") + text = doc.get("text", "") + + # Check if the term/type is actually present in the document text/title + if term_regex.search(f"{title} {text}"): + text_content = text + + # Truncate text if necessary + if max_chars_per_text and len(text_content) > max_chars_per_text: + text_content = text_content[:max_chars_per_text] + "…" + + # Escape single quotes in the term for Python list formatting in the prompt + term_for_prompt = term.replace("'", "\\'") + + prompt_lines.append( + f"Document ID: {doc_id}\nTitle: {title}\nText: {text_content}\nAssociated Types: ['{term_for_prompt}']\n----------------------------------------" + ) + picked_count += 1 + + if picked_count >= sample_per_term: + break # Move to the next term + + prompt_block = "\n".join(prompt_lines) + self.fewshot_types_block = prompt_block + return prompt_block + + def fit(self, train_docs_jsonl: str, terms2doc_json: str, sample_size: int = 28, seed: int = 123) -> None: + """ + Fit phase: Builds and caches the few-shot prompt blocks from the training files. + No model training occurs (Few-Shot/In-Context Learning). + """ + # Build prompt block for Term extraction + _ = self.build_stratified_fewshot_prompt(train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed) + # Build prompt block for Type extraction + _ = self.build_types_fewshot_block(train_docs_jsonl, terms2doc_json, sample_per_term=1) + + # ------------------------- + # Inference helpers (prompt construction and output parsing) + # ------------------------- + def _build_term_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Term Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant terms that could form the basis of an ontology from the above document. + Return ONLY a Python list like ['term1', 'term2', ...] and nothing else. + If no terms are found, return []. + """ + + def _build_type_prompt(self, example_block: str, title: str, text: str) -> str: + """Constructs the full prompt for Type Extraction.""" + return f"""{example_block} + [var] + Title: {title} + Text: {text} + [var] + Extract all relevant TYPES mentioned in the above document that could serve as ontology classes. + Only consider content inside the [var] ... [var] block. + Return ONLY a valid Python list like ['type1', 'type2'] and nothing else. If none, return []. 
+ """ + + def _parse_list_like(self, raw_string: str) -> List[str]: + """Try to extract a Python list of strings from model output robustly.""" + processed_string = raw_string.strip() + if processed_string in ("[]", ""): + return [] + + # 1. Try direct evaluation + try: + parsed_value = ast.literal_eval(processed_string) + if isinstance(parsed_value, list): + # Filter to ensure only strings are returned + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 2. Try finding and evaluating text within outermost brackets [ ... ] + bracket_match = re.search(r"\[[\s\S]*?\]", processed_string) + if bracket_match: + try: + parsed_value = ast.literal_eval(bracket_match.group(0)) + if isinstance(parsed_value, list): + return [item for item in parsed_value if isinstance(item, str)] + except Exception: + pass + + # 3. Fallback: Find comma-separated quoted substrings (less robust, but catches errors) + # Finds content inside either single quotes ('...') or double quotes ("...") + quoted_matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", processed_string) + flattened_list = [a_match or b_match for a_match, b_match in quoted_matches] + return flattened_list + + def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: + """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" + # self.model is an instance of LocalAutoLLM + model_output = self.model.generate([prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0) + return model_output[0] if model_output else "" + + def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + """ + Runs Term Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual terms written. + """ + if not self.fewshot_terms_block: + raise RuntimeError("Few-shot block for terms is empty. Call fit() first.") + + num_written_terms = 0 + with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model + prompt = self._build_term_prompt(self.fewshot_terms_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_terms = self._parse_list_like(raw_output) + + # Write extracted terms + for term_or_type in predicted_terms: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write(json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + "\n") + num_written_terms += 1 + + # Lightweight memory management for long runs + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_terms + + def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + """ + Runs Type Extraction on the test documents and saves results to a JSONL file. + Returns: The count of individual types written. + """ + if not self.fewshot_types_block: + raise RuntimeError("Few-shot block for types is empty. 
Call fit() first.") + + num_written_types = 0 + with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + for line_index, line in enumerate(file_in, start=1): + if 0 < max_lines < line_index: + break + + try: + document = json.loads(line.strip()) + except Exception: + continue # Skip malformed JSON lines + + doc_id = document.get("id", "unknown") + title = document.get("title", "") + text = document.get("text", "") + + # Construct and call model using the dedicated type prompt block + prompt = self._build_type_prompt(self.fewshot_types_block, title, text) + raw_output = self._call_model_one(prompt, max_new_tokens=max_new_tokens) + predicted_types = self._parse_list_like(raw_output) + + # Write extracted types + for term_or_type in predicted_types: + if isinstance(term_or_type, str) and term_or_type.strip(): + file_out.write(json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + "\n") + num_written_types += 1 + + if line_index % 50 == 0: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return num_written_types + + # --- Evaluation utilities (unchanged from prior definition, added docstrings) --- + def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: + """Convert terms2docs JSON into a set of unique (doc_id, term) pairs, lowercased.""" + gold_pairs = set() + with open(terms2doc_path, "r", encoding="utf-8") as file_handle: + term_to_doc_map = json.load(file_handle) + + for term, doc_ids in term_to_doc_map.items(): + clean_term = term.strip().lower() + for doc_id in doc_ids: + gold_pairs.add((doc_id, clean_term)) + return gold_pairs + + def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> Set[Tuple[str, str]]: + """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" + predicted_pairs = set() + with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: + for line in file_handle: + try: + entry = json.loads(line.strip()) + except Exception: + continue + doc_id = entry.get("doc_id") + value = entry.get(key) + if doc_id and value: + predicted_pairs.add((doc_id, value.strip().lower())) + return predicted_pairs + + def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: str = "term") -> float: + """ + Computes set-based binary Precision, Recall, and F1 score against the gold pairs. 
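# Concretely, the metric treats every (doc_id, term) pair in the union of gold and predicted sets
# as one binary decision. A tiny worked sketch with toy pairs:

from sklearn.metrics import precision_recall_fscore_support

gold = {("d1", "coral reef"), ("d1", "fish"), ("d2", "mangrove")}
pred = {("d1", "coral reef"), ("d2", "mangrove"), ("d2", "swamp")}

universe = sorted(gold | pred)                       # every pair seen in either set
y_true = [1 if p in gold else 0 for p in universe]   # 1 = pair is in the gold standard
y_pred = [1 if p in pred else 0 for p in universe]   # 1 = pair was predicted

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)
print(round(precision, 3), round(recall, 3), round(f1, 3))  # 0.667 0.667 0.667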
+ """ + # Load the ground truth and predictions + gold_set = self.load_gold_pairs(terms2doc_path) + predicted_set = self.load_predicted_pairs(predicted_jsonl, key=key) + + # Build combined universe of all pairs for score calculation + all_pairs = sorted(gold_set | predicted_set) + + # Create binary labels (1=present, 0=absent) + y_true = [1 if pair in gold_set else 0 for pair in all_pairs] + y_pred = [1 if pair in predicted_set else 0 for pair in all_pairs] + + # Use scikit-learn for metric calculation + from sklearn.metrics import precision_recall_fscore_support + precision, recall, f1, _ = precision_recall_fscore_support( + y_true, y_pred, average="binary", zero_division=0 + ) + + # Display results + num_true_positives = len(gold_set & predicted_set) + + print("\n📊 Evaluation Results:") + print(f" ✅ Precision: {precision:.4f}") + print(f" ✅ Recall: {recall:.4f}") + print(f" ✅ F1 Score: {f1:.4f}") + print(f" 📌 Gold pairs: {len(gold_set)}") + print(f" 📌 Predicted pairs:{len(predicted_set)}") + print(f" 🎯 True Positives: {num_true_positives}") + + return float(f1) From be80e735b2de7cbc48c2c5bfcb0c34b065c537a0 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Mon, 3 Nov 2025 23:09:36 +0100 Subject: [PATCH 4/7] alexbek learner models --- .../llm_learner_alexbek_rag_term_typing.py | 50 + .../llm_learner_alexbek_rf_term_typing.py | 54 + ...er_alexbek_self_attn_taxonomy_discovery.py | 41 + examples/llm_learner_alexbek_text2onto.py | 74 ++ ontolearner/__init__.py | 10 +- ontolearner/learner/__init__.py | 3 + .../learner/taxonomy_discovery/__init__.py | 1 + .../learner/taxonomy_discovery/alexbek.py | 305 +++++ ontolearner/learner/term_typing/__init__.py | 1 + ontolearner/learner/term_typing/alexbek.py | 809 ++++++++++++ ontolearner/learner/text2onto/__init__.py | 1 + ontolearner/learner/text2onto/alexbek.py | 1084 +++++++++++++++++ 12 files changed, 2432 insertions(+), 1 deletion(-) create mode 100644 examples/llm_learner_alexbek_rag_term_typing.py create mode 100644 examples/llm_learner_alexbek_rf_term_typing.py create mode 100644 examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py create mode 100644 examples/llm_learner_alexbek_text2onto.py create mode 100644 ontolearner/learner/taxonomy_discovery/alexbek.py create mode 100644 ontolearner/learner/term_typing/alexbek.py create mode 100644 ontolearner/learner/text2onto/alexbek.py diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py new file mode 100644 index 0000000..5723e36 --- /dev/null +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -0,0 +1,50 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekRAGLearner + +# Load the GeoNames ontology. +ontology = GeoNames() +ontology.load() + +# Extract labeled items and split into train/test sets for evaluation +train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42) + +# Configure a Retrieval-Augmented Generation (RAG) term-typing classifier. 
+# - llm_model_id: generator used to predict types from the prompt + retrieved examples +# - retriever_model_id: encoder used to embed items and fetch top-k similar (RAG) examples +# - device: "cuda" for GPU or "cpu" +# - top_k: number of nearest examples to retrieve per query term +# - max_new_tokens: decoding budget of the LLM during prediction +# - output_dir: where intermediate artifacts / logs can be stored +rag_learner = AlexbekRAGLearner( + llm_model_id="Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id="sentence-transformers/all-MiniLM-L6-v2", + device="cuda", + top_k=3, + max_new_tokens=256, + output_dir="./results/", +) + +# Build the pipeline and pass raw structured objects end-to-end. +# We place the RAG learner in the llm slot and set llm_id accordingly. +pipe = LearnerPipeline( + llm=rag_learner, + llm_id="Qwen/Qwen2.5-0.5B-Instruct", + ontologizer_data=True, +) + +# Run the full learning pipeline on the term-typing task +# - task="term-typing" (Task B) +# - evaluate=True computes precision/recall/F1 on the held-out test split +# - ontologizer_data=True must match the pipeline flag above +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display the evaluation results and runtime +print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} +print("Elapsed time (s):", outputs.get("elapsed_time")) diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py new file mode 100644 index 0000000..c5c7454 --- /dev/null +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -0,0 +1,54 @@ +# Import core modules from the OntoLearner library +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features + +# Load the GeoNames ontology and extract labeled term-typing data + +ontology = GeoNames() +ontology.load() + +data = ontology.extract() + +# Split the labeled term-typing data into train and test sets +train_data, test_data = train_test_split( + data, + test_size=0.2, + random_state=42 +) + +# Configure the RF-based learner (embeddings + optional graph features) +# - device: "cpu" or "cuda" +# - threshold: decision threshold for multi-label assignment +# - use_graph_features: include ontology-graph-derived features if available +rf_learner = AlexbekRFLearner( + device="cpu", # switch to "cuda" if you have a GPU + batch_size=16, + max_length=512, # max tokenizer length for embedding model inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True # set False for pure RF on text embeddings only +) + +# Build the pipeline and pass raw structured objects end-to-end. 
+pipe = LearnerPipeline( + retriever=rf_learner, + retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory + ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} + device="cpu", + batch_size=16 +) + +# Run the full learning pipeline on the term-typing task +outputs = pipe( + train_data=train_data, + test_data=test_data, + task="term-typing", + evaluate=True, + ontologizer_data=True, +) + +# Display evaluation summary and runtime +print("Metrics:", outputs.get("metrics")) + +print("Elapsed time:", outputs["elapsed_time"]) + +print(ontology) diff --git a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py new file mode 100644 index 0000000..b78976f --- /dev/null +++ b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py @@ -0,0 +1,41 @@ +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner import AlexbekCrossAttnLearner +# 1) Load & split +ontology = GeoNames() +ontology.load() +data = ontology.extract() +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) + +# 2) Configure the cross-attention learner +cross_learner = AlexbekCrossAttnLearner( + embedding_model="sentence-transformers/all-MiniLM-L6-v2", # or "Qwen/Qwen2.5-1.5B-... (if wrapped as ST)" + device="cpu", + num_heads=8, + lr=5e-5, + weight_decay=0.01, + num_epochs=1, + batch_size=256, + neg_ratio=1.0, + output_dir="./results/crossattn/", + seed=42, +) + +# 3) Build pipeline +pipeline = LearnerPipeline( + llm=cross_learner, # <- our learner + llm_id="cross-attn", # label for bookkeeping + ontologizer_data=False # pass raw ontology objects as in your example +) + +# 4) Train + predict + evaluate +outputs = pipeline( + train_data=train_data, + test_data=test_data, + task="taxonomy-discovery", + evaluate=True, + ontologizer_data=False, +) + +print("Metrics:", outputs.get("metrics")) +print("Elapsed time:", outputs["elapsed_time"]) +print(outputs) diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py new file mode 100644 index 0000000..caf4c5b --- /dev/null +++ b/examples/llm_learner_alexbek_text2onto.py @@ -0,0 +1,74 @@ +import os +import json +import torch + +# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs +from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner + +# Local folder where the dataset is stored (relative to this script) +DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" + +# Input paths (already saved) +TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") +TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") +TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl") + +# Output paths +DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl") +TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json") +TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json") + +# Device selection +DEVICE = ( + "cuda" + if torch.cuda.is_available() + else ("mps" if torch.backends.mps.is_available() else "cpu") +) + +# Model config +MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" +LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU + +# 1) Load LLM +llm = LocalAutoLLM(device=DEVICE) +llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT) + +# 
2) Build few-shot exemplars from training split +learner = AlexbekFewShotLearner(model=llm, device=DEVICE) +learner.fit( + train_docs_jsonl=TRAIN_DOCS_PATH, + terms2doc_json=TRAIN_TERMS2DOCS_PATH, + # use defaults for sample size/seed +) + +# 3) Predict terms per test document +os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True) +num_written_doc_terms = learner.predict_terms( + docs_test_jsonl=TEST_DOCS_FULL_PATH, + out_jsonl=DOC_TERMS_OUT_PATH, + # use defaults for max_new_tokens and few_shot_k +) +print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}") + +# 4) Predict types for extracted terms, using the JSONL we just wrote +typing_summary = learner.predict_types_from_terms( + doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly + doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) + model_id=MODEL_ID, # reuse the same small model + out_terms2types=TERMS2TYPES_OUT_PATH, + out_types2docs=TYPES2DOCS_OUT_PATH, + # use defaults for everything else +) + +print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types") +print(f"[saved] {TERMS2TYPES_OUT_PATH}") +print(f"[saved] {TYPES2DOCS_OUT_PATH}") + +# 5) Small preview of term→types +try: + with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin: + preview = json.load(fin)[:3] + print("[preview] first 3:") + print(json.dumps(preview, ensure_ascii=False, indent=2)) +except Exception as e: + print(f"[preview] skipped: {e}") diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 49b94c4..5ebd3f6 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -36,7 +36,11 @@ SKHNLPSequentialFTLearner, SBUNLPFewShotLearner, SBUNLPZSLearner, - SBUNLPText2OntoLearner) + SBUNLPText2OntoLearner, + AlexbekCrossAttnLearner, + AlexbekRFLearner, + AlexbekRAGLearner, + AlexbekFewShotLearner) from ._learner import LearnerPipeline from .processor import Processor @@ -61,6 +65,10 @@ "SBUNLPFewShotLearner", "SBUNLPZSLearner", "SBUNLPText2OntoLearner", + "AlexbekCrossAttnLearner", + "AlexbekRFLearner", + "AlexbekRAGLearner", + "AlexbekFewShotLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 4f41586..71020e8 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -23,3 +23,6 @@ from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner from .term_typing.sbunlp import SBUNLPZSLearner from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner +from .taxonomy_discovery.alexbek import AlexbekCrossAttnLearner +from .term_typing.alexbek import AlexbekRFLearner, AlexbekRAGLearner +from .text2onto.alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py index d52513b..57a845b 100644 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ b/ontolearner/learner/taxonomy_discovery/__init__.py @@ -15,3 +15,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner from .sbunlp import SBUNLPFewShotLearner +from .alexbek import AlexbekCrossAttnLearner diff --git a/ontolearner/learner/taxonomy_discovery/alexbek.py b/ontolearner/learner/taxonomy_discovery/alexbek.py new file mode 100644 index 0000000..616d50f --- /dev/null +++ b/ontolearner/learner/taxonomy_discovery/alexbek.py @@ -0,0 +1,305 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple + +import math +import os +import random +import torch +import torch.nn as nn +import torch.nn.functional as F +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner + +class RMSNorm(nn.Module): + """Root Mean Square normalization with learnable scale. + + Computes: y = weight * x / sqrt(mean(x^2) + eps) + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + rms_inv = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + return self.weight * (x * rms_inv) + +class CrossAttentionHead(nn.Module): + """Minimal multi-head *pair* scorer using cross-attention-style projections. + + Given child vector c and parent vector p: + q = Wq * c, k = Wk * p + per-head score = (q_h · k_h) / sqrt(d_head) + aggregate by mean across heads, then sigmoid to get probability. + """ + + def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6): + super().__init__() + assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads" + self.hidden_size = hidden_size + self.num_heads = num_heads + self.dim_per_head = hidden_size // num_heads + + # Linear projections for queries (child) and keys (parent) + self.query_projection = nn.Linear(hidden_size, hidden_size, bias=False) + self.key_projection = nn.Linear(hidden_size, hidden_size, bias=False) + + # Pre-projection normalization for stability + self.query_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.key_norm = RMSNorm(hidden_size, eps=rms_norm_eps) + + # Xavier init helps stabilize training + nn.init.xavier_uniform_(self.query_projection.weight) + nn.init.xavier_uniform_(self.key_projection.weight) + + def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor) -> torch.Tensor: + """Score (child, parent) pairs. + + Args: + child_embeddings: Tensor of shape (batch, hidden_size) + parent_embeddings: Tensor of shape (batch, hidden_size) + Returns: + Tensor of probabilities with shape (batch,) + """ + batch_size, _ = child_embeddings.shape + + # Project and normalize + queries = self.query_norm(self.query_projection(child_embeddings)) + keys = self.key_norm(self.key_projection(parent_embeddings)) + + # Reshape into heads: (batch, heads, dim_per_head) + queries = queries.view(batch_size, self.num_heads, self.dim_per_head) + keys = keys.view(batch_size, self.num_heads, self.dim_per_head) + + # Scaled dot-product similarity per head -> (batch, heads) + per_head_scores = (queries * keys).sum(-1) / math.sqrt(self.dim_per_head) + + # Aggregate across heads -> (batch,) + mean_score = per_head_scores.mean(-1) + + # Map to probability + return torch.sigmoid(mean_score) + +class AlexbekCrossAttnLearner(AutoLearner): + """Cross-Attention Taxonomy Learner (inherits AutoLearner). + + - Encodes type strings with a SentenceTransformer. 
+ - Trains a small cross-attention head to score (parent, child) edges. + - Predicts probabilities for provided pairs. + + Helper functions live in this same module (below), *not* as class methods. + """ + + def __init__( + self, + *, + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "cpu", + num_heads: int = 8, + lr: float = 5e-5, + weight_decay: float = 0.01, + num_epochs: int = 1, + batch_size: int = 256, + neg_ratio: float = 1.0, # negatives per positive + output_dir: str = "./results/", + seed: int = 42, + **kwargs: Any, + ): + """Configure the learner. + + All configuration is kept directly on the learner (no separate Config class). + """ + super().__init__(**kwargs) + + # ----- hyperparameters / settings ----- + self.embedding_model_id = embedding_model + self.requested_device = device + self.num_heads = num_heads + self.learning_rate = lr + self.weight_decay = weight_decay + self.num_epochs = num_epochs + self.batch_size = batch_size + self.negative_ratio = neg_ratio + self.output_dir = output_dir + self.seed = seed + + # Prefer requested device but gracefully fall back to CPU + if torch.cuda.is_available() or self.requested_device == "cpu": + self.device = torch.device(self.requested_device) + else: + self.device = torch.device("cpu") + + # Will be set in load() + self.embedder: Optional[SentenceTransformer] = None + self.cross_attn_head: Optional[CrossAttentionHead] = None + self.embedding_dim: Optional[int] = None + + # Cache of term -> embedding tensor (on device) + self.term_to_vector: Dict[str, torch.Tensor] = {} + + os.makedirs(self.output_dir, exist_ok=True) + random.seed(self.seed) + torch.manual_seed(self.seed) + + def load(self, **kwargs: Any): + """Load the sentence embedding model and initialize the cross-attention head.""" + model_id = kwargs.get("embedding_model", self.embedding_model_id) + self.embedder = SentenceTransformer(model_id, trust_remote_code=True, device=str(self.device)) + + # Probe output dimensionality using a dummy encode + probe_embedding = self.embedder.encode(["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False) + self.embedding_dim = int(probe_embedding.shape[-1]) + + # Initialize the cross-attention head + self.cross_attn_head = CrossAttentionHead(hidden_size=self.embedding_dim, num_heads=self.num_heads).to( + self.device + ) + + def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + if self.embedder is None or self.cross_attn_head is None: + self.load() + + if not test: + positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + self._ensure_term_embeddings(unique_terms) + negative_pairs = self._sample_negative_pairs( + positive_pairs, unique_terms, ratio=self.negative_ratio, seed=self.seed + ) + self._train_cross_attn_head(positive_pairs, negative_pairs) + return None + else: + candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + self._ensure_term_embeddings(unique_terms, append_only=True) + probabilities = self._score_parent_child_pairs(candidate_pairs) + + predictions = [ + {"parent": parent, "child": child, "score": float(prob), "label": int(prob >= 0.5)} + for (parent, child), prob in zip(candidate_pairs, probabilities) + ] + return predictions + + def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) -> None: + """Encode terms with the sentence embedder and store in cache. 
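# The pair score above boils down to a per-head scaled dot product between projected child and
# parent vectors, averaged over heads and squashed with a sigmoid. A minimal functional sketch of
# that scoring step (random matrices stand in for the trained Wq/Wk; the RMSNorm is omitted):

import math
import torch

def pair_scores(child: torch.Tensor, parent: torch.Tensor,
                wq: torch.Tensor, wk: torch.Tensor, num_heads: int) -> torch.Tensor:
    """child, parent: (batch, hidden); wq, wk: (hidden, hidden). Returns (batch,) probabilities."""
    batch, hidden = child.shape
    d_head = hidden // num_heads
    q = (child @ wq).view(batch, num_heads, d_head)   # project child -> queries, split into heads
    k = (parent @ wk).view(batch, num_heads, d_head)  # project parent -> keys, split into heads
    per_head = (q * k).sum(-1) / math.sqrt(d_head)    # scaled dot product per head: (batch, heads)
    return torch.sigmoid(per_head.mean(-1))           # average heads, map to a probability

hidden_size, heads = 384, 8                           # 384 = all-MiniLM-L6-v2 embedding width
child = torch.randn(4, hidden_size)
parent = torch.randn(4, hidden_size)
wq, wk = torch.randn(hidden_size, hidden_size), torch.randn(hidden_size, hidden_size)
print(pair_scores(child, parent, wq, wk, heads).shape)  # torch.Size([4])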
+ + Args: + terms: list of unique strings to embed + append_only: if True, only embed terms missing from cache + """ + if self.embedder is None: + raise RuntimeError("Call load() before building term embeddings") + + terms_to_encode = [t for t in terms if t not in self.term_to_vector] if append_only else terms + if not terms_to_encode: + return + + embeddings = self.embedder.encode( + terms_to_encode, + convert_to_tensor=True, + normalize_embeddings=False, + batch_size=256, + show_progress_bar=False, + ) + for term, embedding in zip(terms_to_encode, embeddings): + self.term_to_vector[term] = embedding.detach().to(self.device) + + def _pairs_as_tensors(self, pairs: List[Tuple[str, str]]) -> Tuple[torch.Tensor, torch.Tensor]: + """Turn list of (parent, child) strings into two aligned tensors on device.""" + # child embeddings tensor of shape (batch, dim) + child_tensor = torch.stack([self.term_to_vector[child] for (_, child) in pairs], dim=0).to(self.device) + # parent embeddings tensor of shape (batch, dim) + parent_tensor = torch.stack([self.term_to_vector[parent] for (parent, _) in pairs], dim=0).to(self.device) + return child_tensor, parent_tensor + + def _train_cross_attn_head(self, positive_pairs: List[Tuple[str, str]], negative_pairs: List[Tuple[str, str]]) -> None: + """Train the cross-attention head with BCE loss on labeled pairs.""" + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. Call load().") + + self.cross_attn_head.train() + optimizer = torch.optim.AdamW( + self.cross_attn_head.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + ) + + # Build a simple supervised dataset: 1 for positive, 0 for negative + labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [(1, pc) for pc in positive_pairs] + [ + (0, nc) for nc in negative_pairs + ] + random.shuffle(labeled_pairs) + + def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: int): + for start in range(0, len(items), batch_size): + yield items[start : start + batch_size] + + for epoch in range(self.num_epochs): + epoch_loss_sum = 0.0 + for minibatch in iterate_minibatches(labeled_pairs, self.batch_size): + labels = torch.tensor([y for y, _ in minibatch], dtype=torch.float32, device=self.device) + string_pairs = [pc for _, pc in minibatch] + child_tensor, parent_tensor = self._pairs_as_tensors(string_pairs) + + probs = self.cross_attn_head(child_tensor, parent_tensor) + loss = F.binary_cross_entropy(probs, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss_sum += float(loss.item()) * len(minibatch) + + + def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float]: + """Compute probability scores for (parent, child) pairs.""" + if self.cross_attn_head is None: + raise RuntimeError("Head not initialized. 
Call load().") + + self.cross_attn_head.eval() + scores: List[float] = [] + with torch.no_grad(): + for start in range(0, len(pairs), self.batch_size): + chunk = pairs[start : start + self.batch_size] + child_tensor, parent_tensor = self._pairs_as_tensors(chunk) + prob = self.cross_attn_head(child_tensor, parent_tensor) + scores.extend(prob.detach().cpu().tolist()) + return scores + + def _extract_parent_child_pairs_and_terms(self, data): + parent_child_pairs = [] + unique_terms = set() + for edge in getattr(data, "type_taxonomies").taxonomies: + parent, child = str(edge.parent), str(edge.child) + parent_child_pairs.append((parent, child)) + unique_terms.add(parent) + unique_terms.add(child) + return parent_child_pairs, sorted(unique_terms) + + def _sample_negative_pairs(self, positive_pairs, terms, ratio: float = 1.0, seed: int = 42): + random.seed(seed) + term_list = list(terms) + positive_set = set(positive_pairs) + negatives = [] + target_negative_count = int(len(positive_pairs) * ratio) + while len(negatives) < target_negative_count: + parent = random.choice(term_list) + child = random.choice(term_list) + if parent == child: + continue + candidate = (parent, child) + if candidate in positive_set: + continue + negatives.append(candidate) + return negatives diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py index ebd8cd9..a42d716 100644 --- a/ontolearner/learner/term_typing/__init__.py +++ b/ontolearner/learner/term_typing/__init__.py @@ -14,3 +14,4 @@ from .rwthdbis import RWTHDBISSFTLearner from .sbunlp import SBUNLPZSLearner +from .alexbek import AlexbekRFLearner, AlexbekRAGLearner diff --git a/ontolearner/learner/term_typing/alexbek.py b/ontolearner/learner/term_typing/alexbek.py new file mode 100644 index 0000000..7aa6033 --- /dev/null +++ b/ontolearner/learner/term_typing/alexbek.py @@ -0,0 +1,809 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import json +import re +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +import networkx as nx +from tqdm import tqdm +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.ensemble import RandomForestClassifier +from sklearn.multiclass import OneVsRestClassifier + +from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM +from sentence_transformers import SentenceTransformer + +from ...base import AutoLearner, AutoRetriever + +class AlexbekRFLearner(AutoRetriever): + """ + Embedding-based multi-label classifier for *term typing*. + + Pipeline overview: + 1) Load a Hugging Face encoder (tokenizer + model). + 2) Encode input terms into sentence embeddings. + 3) Optionally augment with simple graph (co-occurrence) features. + 4) Train a One-vs-Rest RandomForest on the concatenated features. + 5) Predict multi-label types with a probability threshold (fallback to top-1). 
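# Training pairs for the cross-attention head above are built by matching each gold edge with
# randomly drawn non-edges, mirroring _sample_negative_pairs. A compact sketch of that rejection
# sampling (toy terms; with a realistically large term inventory the loop terminates quickly):

import random
from typing import List, Set, Tuple

def sample_negatives(positives: Set[Tuple[str, str]], terms: List[str],
                     ratio: float = 1.0, seed: int = 42) -> List[Tuple[str, str]]:
    """Draw random (parent, child) pairs that are neither self-loops nor gold edges."""
    random.seed(seed)
    negatives: List[Tuple[str, str]] = []
    target = int(len(positives) * ratio)
    while len(negatives) < target:
        parent, child = random.choice(terms), random.choice(terms)
        if parent != child and (parent, child) not in positives:
            negatives.append((parent, child))
    return negatives

positives = {("animal", "dog"), ("animal", "cat"), ("plant", "fern")}
print(sample_negatives(positives, ["animal", "plant", "dog", "cat", "fern"]))  # 3 random non-edges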
+ + API expected by LearnerPipeline: + - load(model_id) + - fit(data, task, ontologizer=True) + - predict(data, task, ontologizer=True) + - tasks_ground_truth_former(data, task) + """ + + def __init__( + self, + device: str = "cpu", + batch_size: int = 16, + max_length: int = 256, + threshold: float = 0.30, + use_graph_features: bool = True, + rf_kwargs: Optional[Dict[str, Any]] = None, + ): + # Runtime / inference settings + self.device = torch.device(device) + self.batch_size = batch_size + self.max_length = max_length + self.threshold = threshold # probability cutoff for selecting labels + self.use_graph_features = use_graph_features + + # RandomForest hyperparameters (with sensible defaults) + self.rf_kwargs = rf_kwargs or dict( + n_estimators=200, max_depth=20, class_weight="balanced", random_state=42 + ) + + # Filled during load/fit + self.model_name: Optional[str] = None + self.tokenizer: Optional[AutoTokenizer] = None + self.embedding_model: Optional[AutoModel] = None + + # Label processing / classifier / optional graph + self.label_binarizer = MultiLabelBinarizer() + self.ovr_random_forest: Optional[OneVsRestClassifier] = None + self.term_graph: Optional[nx.Graph] = None + + def load(self, model_id: str, **_: Any) -> None: + """Load a Hugging Face encoder by model id (tokenizer + base model).""" + self.model_name = model_id + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.embedding_model = AutoModel.from_pretrained(model_id) + self.embedding_model.eval().to(self.device) + + def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: + """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features).""" + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + + # Normalize incoming training data into a list of dicts: {term, types, RAG} + training_rows = self._as_term_types_dicts(data) + if not training_rows: + raise ValueError("No valid training examples found (need 'term' and 'types').") + + # Split out terms and raw labels + training_terms: List[str] = [row["term"] for row in training_rows] + raw_label_lists: List[List[str]] = [row["types"] for row in training_rows] + + # Fit label binarizer to learn label space/order + self.label_binarizer.fit(raw_label_lists) + + # Encode terms to sentence embeddings + term_embeddings_train = self._encode(training_terms) + + # Optionally build a light-weight co-occurrence graph and extract features + if self.use_graph_features: + self.term_graph = self._create_term_graph(training_rows) + graph_features_train = self._extract_graph_features(self.term_graph, training_terms) + X_train = np.hstack([term_embeddings_train, graph_features_train]) + else: + self.term_graph = None + X_train = term_embeddings_train + + # Multi-label targets (multi-hot) + Y_train = self.label_binarizer.transform(raw_label_lists) + + # One-vs-Rest RandomForest (one binary RF per label) + self.ovr_random_forest = OneVsRestClassifier(RandomForestClassifier(**self.rf_kwargs)) + self.ovr_random_forest.fit(X_train, Y_train) + + + def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> List[Dict[str, Any]]: + """Predict multi-label types for input terms. + + Returns a list of dicts with keys: {id, term, types}. 
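# The supervised core of fit() is standard scikit-learn: multi-hot encode the type lists, then
# train one RandomForest per label via One-vs-Rest. A self-contained sketch with random feature
# vectors standing in for the encoded terms (+ optional graph features):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

labels = [["City"], ["River", "Waterbody"], ["City", "Capital"]]
X = np.random.rand(3, 8)                       # stand-in for the term embeddings

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)                  # multi-hot matrix, one column per type
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, max_depth=20,
                                                 class_weight="balanced", random_state=42))
clf.fit(X, Y)

proba = clf.predict_proba(np.random.rand(2, 8))  # (n_samples, n_labels) probabilities
print(list(mlb.classes_), proba.shape)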
+ """ + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + if self.ovr_random_forest is None or self.tokenizer is None or self.embedding_model is None: + raise RuntimeError("Call load() and fit() before predict().") + + # Normalize prediction input into parallel lists of terms and example ids + test_terms, example_ids = self._as_predict_terms_ids(data) + + # Encode terms + term_embeddings_test = self._encode(test_terms) + + # Match feature layout used during training + if self.use_graph_features and self.term_graph is not None: + graph_features_test = self._extract_graph_features(self.term_graph, test_terms) + X_test = np.hstack([term_embeddings_test, graph_features_test]) + else: + X_test = term_embeddings_test + + # Probabilities per label (shape: [n_samples, n_labels]) + probability_matrix = self.ovr_random_forest.predict_proba(X_test) + + predictions: List[Dict[str, Any]] = [] + label_names = self.label_binarizer.classes_ + threshold = float(self.threshold) + + # Select labels above threshold; fallback to argmax if none exceed it + for row_index, label_probabilities in enumerate(probability_matrix): + selected_label_indices = np.where(label_probabilities > threshold)[0] + if len(selected_label_indices) == 0: + selected_label_indices = [int(np.argmax(label_probabilities))] + + predicted_types = [label_names[label_idx] for label_idx in selected_label_indices] + + predictions.append( + { + "id": example_ids[row_index], + "term": test_terms[row_index], + "types": predicted_types, + } + ) + return predictions + + def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, Any]]: + """Normalize ground-truth into a list of {id, term, types} dicts for evaluation.""" + if task != "term-typing": + raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + return self._as_gold_id_term_types(data) + + def _encode(self, texts: List[str]) -> np.ndarray: + """Encode a list of strings into L2-normalized sentence embeddings (NumPy array). + + If no texts are provided, returns an empty array with width equal to the model hidden size. + """ + assert self.tokenizer is not None and self.embedding_model is not None, "Call load(model_id) first." 
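# Label selection in predict() is a plain probability cutoff with a top-1 fallback so every term
# receives at least one type. A minimal sketch of that decision rule:

import numpy as np

def select_labels(probabilities: np.ndarray, label_names: list, threshold: float = 0.30) -> list:
    """Keep labels above the threshold; if none qualify, fall back to the single best label."""
    chosen = np.where(probabilities > threshold)[0]
    if len(chosen) == 0:
        chosen = [int(np.argmax(probabilities))]
    return [label_names[i] for i in chosen]

names = ["City", "Capital", "River", "Waterbody"]
print(select_labels(np.array([0.55, 0.41, 0.05, 0.02]), names))  # ['City', 'Capital']
print(select_labels(np.array([0.12, 0.09, 0.28, 0.22]), names))  # ['River'] (argmax fallback)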
+ + if not texts: + hidden_size = getattr(getattr(self.embedding_model, "config", None), "hidden_size", 768) + return np.zeros((0, hidden_size), dtype=np.float32) + + batch_embeddings: List[torch.Tensor] = [] + + for start_idx in tqdm(range(0, len(texts), self.batch_size), desc="Embedding"): + end_idx = start_idx + self.batch_size + batch_texts = texts[start_idx:end_idx] + + # Tokenize and move to device + tokenized_batch = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ).to(self.device) + + # Forward pass without gradients + with torch.no_grad(): + model_output = self.embedding_model(**tokenized_batch) + + # Prefer dedicated pooler if provided; otherwise pool by last valid token + if hasattr(model_output, "pooler_output") and model_output.pooler_output is not None: + sentence_embeddings = model_output.pooler_output + else: + sentence_embeddings = self._last_token_pool( + model_output.last_hidden_state, tokenized_batch["attention_mask"] + ) + + # L2-normalize embeddings for stability + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + # Detach, move to CPU, collect + batch_embeddings.append(sentence_embeddings.detach().cpu()) + + # Best-effort memory cleanup (especially useful on CUDA) + del tokenized_batch, model_output, sentence_embeddings + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + # Concatenate all batches and convert to NumPy + return torch.cat(batch_embeddings, dim=0).numpy() + + def _last_token_pool(self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Select the last *non-padding* token embedding for each sequence in the batch.""" + last_valid_token_idx = attention_mask.sum(dim=1) - 1 # (batch,) + batch_row_idx = torch.arange(last_hidden_states.size(0), device=last_hidden_states.device) + return last_hidden_states[batch_row_idx, last_valid_token_idx] + + def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: + """Create a simple undirected co-occurrence graph from training rows. + + Nodes: terms (with node attribute 'types'). + Edges: between a term and each neighbor from its optional RAG list. + Edge weight = number of shared types (or 0.1 if none shared). + """ + graph = nx.Graph() + + for row in training_rows: + term = row["term"] + term_types = row.get("types", []) + graph.add_node(term, types=term_types) + + # RAG may be a list of neighbor dicts like {"term": ..., "types": [...]} + for neighbor in (row.get("RAG", []) or []): + neighbor_term = neighbor.get("term") + neighbor_types = neighbor.get("types", []) + + # Shared-type-based edge weight (weak edge if no overlap) + shared_types = set(term_types).intersection(set(neighbor_types)) + edge_weight = float(len(shared_types)) if shared_types else 0.1 + + graph.add_edge(term, neighbor_term, weight=edge_weight) + + return graph + + def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np.ndarray: + """Compute simple per-term graph features. + + For each term we compute a 4-dim vector: + [degree, clustering_coefficient, degree_centrality, pagerank_score] + Returns an array of shape [len(terms), 4]. 
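# For intuition, the four per-term features can be reproduced directly with networkx on a toy
# term graph (values are illustrative; an isolated term falls back to near-zero features):

import networkx as nx

g = nx.Graph()
g.add_edge("lake", "river", weight=1.0)
g.add_edge("lake", "waterbody", weight=2.0)
g.add_edge("river", "waterbody", weight=1.0)
g.add_node("city")                          # term with no co-occurrence edges

centrality = nx.degree_centrality(g)
pagerank = nx.pagerank(g)
for term in ["lake", "city"]:
    features = [
        float(g.degree(term)),              # number of neighbours
        float(nx.clustering(g, term)),      # fraction of neighbour pairs that are themselves connected
        centrality.get(term, 0.0),
        pagerank.get(term, 0.0),
    ]
    print(term, [round(x, 3) for x in features])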
+ """ + if len(term_graph): + degree_centrality = nx.degree_centrality(term_graph) + pagerank_scores = nx.pagerank(term_graph) + else: + degree_centrality, pagerank_scores = {}, {} + + feature_rows: List[List[float]] = [] + for term in terms: + if term in term_graph: + feature_rows.append( + [ + float(term_graph.degree(term)), + float(nx.clustering(term_graph, term)), + float(degree_centrality.get(term, 0.0)), + float(pagerank_scores.get(term, 0.0)), + ] + ) + else: + feature_rows.append([0.0, 0.0, 0.0, 0.0]) + + return np.asarray(feature_rows, dtype=np.float32) + + def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: + """Normalize diverse training data formats to a list of dicts: {term, types, RAG}.""" + normalized_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for item in term_typings_attr: + term_text = getattr(item, "term", None) + type_list = getattr(item, "types", None) + rag_neighbors = getattr(item, "RAG", None) + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + return normalized_rows + + # Otherwise: must be a list/tuple-like container + if not isinstance(data, (list, tuple)): + raise ValueError("Training data must be a list/tuple or expose .term_typings") + + if not data: + return normalized_rows + + # Case 2: list of dicts + if isinstance(data[0], dict): + for row in data: + term_text = row.get("term") + type_list = row.get("types") + rag_neighbors = row.get("RAG") + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + return normalized_rows + + # Case 3: list of tuples/lists: (term, types[, RAG]) + for item in data: + if not isinstance(item, (list, tuple)) or len(item) < 2: + continue + term_text, type_list = item[0], item[1] + rag_neighbors = item[2] if len(item) > 2 else None + if term_text is None or type_list is None: + continue + if not isinstance(type_list, list): + type_list = [type_list] + normalized_rows.append( + {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + ) + + return normalized_rows + + def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: + """Normalize prediction input into parallel lists: (terms, ids).""" + terms: List[str] = [] + example_ids: List[Any] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + terms.append(str(getattr(item, "term", ""))) + example_ids.append(getattr(item, "id", getattr(item, "ID", idx))) + return terms, example_ids + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + terms.append(str(row.get("term", ""))) + example_ids.append(row.get("id", row.get("ID", i))) + return terms, example_ids + + # 2b) list of tuples/lists: (term, id[, ...]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row: + continue + 
terms.append(str(tuple_row[0])) + example_ids.append(tuple_row[1] if len(tuple_row) > 1 else i) + return terms, example_ids + + # 2c) list of strings (terms only) + if isinstance(first_element, str): + terms = [str(x) for x in data] # type: ignore[arg-type] + example_ids = list(range(len(terms))) + return terms, example_ids + + raise ValueError("Unsupported predict() input format.") + + def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: + """Normalize gold labels into a list of dicts: {id, term, types}.""" + gold_rows: List[Dict[str, Any]] = [] + + # Case 1: object with attribute `.term_typings` + term_typings_attr = getattr(data, "term_typings", None) + if term_typings_attr is not None: + for idx, item in enumerate(term_typings_attr): + gold_id = getattr(item, "id", getattr(item, "ID", idx)) + term_text = str(getattr(item, "term", "")) + type_list = getattr(item, "types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + # Case 2: list/tuple container + if isinstance(data, (list, tuple)) and data: + first_element = data[0] + + # 2a) list of dicts + if isinstance(first_element, dict): + for i, row in enumerate(data): + gold_id = row.get("id", row.get("ID", i)) + term_text = str(row.get("term", "")) + type_list = row.get("types", []) + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + # 2b) list of tuples/lists: (term, types[, id]) + if isinstance(first_element, (list, tuple)): + for i, tuple_row in enumerate(data): + if not tuple_row or len(tuple_row) < 2: + continue + term_text = str(tuple_row[0]) + type_list = tuple_row[1] + gold_id = tuple_row[2] if len(tuple_row) > 2 else i + if not isinstance(type_list, list): + type_list = [type_list] + gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + return gold_rows + + raise ValueError("Unsupported ground-truth input format for tasks_ground_truth_former().") + +class AlexbekRAGLearner(AutoLearner): + """Retrieval-Augmented Term Typing learner (single task: term-typing). + + Flow: + 1) fit: collect (term -> [types]) examples, build an in-memory index + using a sentence-embedding model. + 2) predict: for each new term, retrieve top-k similar examples, compose a + structured prompt, query an instruction-tuned causal LLM, and parse types. + + Returns a list of dicts: {"term": str, "types": List[str], "id": Optional[str]}. 
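+
+    Usage sketch (minimal direct-call flow; the keyword value shown is the
+    constructor default and `train_data`/`test_data` stand in for your own
+    extracted ontology data):
+
+        learner = AlexbekRAGLearner(top_k=3)
+        learner.load()                                   # load LLM + retriever
+        learner.fit(train_data, task="term-typing")      # build the RAG index
+        results = learner.predict(test_data, task="term-typing")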
+ """ + + def __init__( + self, + llm_model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", + retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", + device: str = "auto", # "auto" | "cuda" | "cpu" + token: str = "", # HF token if needed + top_k: int = 3, + max_new_tokens: int = 256, + gen_batch_size: int = 4, # generation batch size + enc_batch_size: int = 64, # embedding batch size + **kwargs: Any, # absorb extra pipeline-style args + ) -> None: + super().__init__() + + # Consolidated configuration for simple serialization + self.cfg: Dict[str, Any] = { + "llm_model_id": llm_model_id, + "retriever_model_id": retriever_model_id, + "device": device, + "token": token, + "top_k": int(top_k), + "max_new_tokens": int(max_new_tokens), + "gen_batch_size": int(gen_batch_size), + "enc_batch_size": int(enc_batch_size), + } + self.extra_cfg: Dict[str, Any] = dict(kwargs) + + # LLM components + self.tokenizer: Optional[AutoTokenizer] = None + self.generation_model: Optional[AutoModelForCausalLM] = None + + # Retriever components + self.embedder: Optional[SentenceTransformer] = None + self.indexed_corpus: List[str] = [] # items: " || [...]" + self.corpus_embeddings: Optional[torch.Tensor] = None + + # Training cache of (term, [types]) tuples + self.train_term_types: List[Tuple[str, List[str]]] = [] + + # Prompt templates + self._system_prompt: str = ( + "You are an expert in ontologies and semantic term classification.\n" + "Task: determine semantic types for the TERM using the EXAMPLES provided.\n" + "Rules:\n" + "1) Types must be generalizing categories from the domain ontology.\n" + "2) Be concise. Respond ONLY in JSON using double quotes.\n" + 'Format: {"term":"...", "reasoning":"<<=100 words>>", "types":["...", "..."]}\n' + ) + self._user_prompt_template: str = ( + """{examples} + + TERM: {term} + + TASK: Determine semantic types for the given term based on the domain ontology. + Remember: types are generalizing categories, not the term itself. Respond in JSON. + """ + ) + + def load( + self, + model_id: Optional[str] = None, + retriever_id: Optional[str] = None, + device: Optional[str] = None, + token: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Load the LLM and the embedding retriever. 
Overrides constructor values if provided.""" + if model_id is not None: + self.cfg["llm_model_id"] = model_id + if retriever_id is not None: + self.cfg["retriever_model_id"] = retriever_id + if device is not None: + self.cfg["device"] = device + if token is not None: + self.cfg["token"] = token + self.extra_cfg.update(kwargs) + + # Choose device & dtype for the LLM + cuda_available: bool = torch.cuda.is_available() + use_cuda: bool = cuda_available and (self.cfg["device"] != "cpu") + device_map: str = "auto" if use_cuda else "cpu" + torch_dtype = torch.bfloat16 if use_cuda else torch.float32 + + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + self.cfg["llm_model_id"], padding_side="left", token=self.cfg["token"] + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # LLM + self.generation_model = AutoModelForCausalLM.from_pretrained( + self.cfg["llm_model_id"], + device_map=device_map, + torch_dtype=torch_dtype, + token=self.cfg["token"], + ) + + # Deterministic decoding defaults + generation_cfg = self.generation_model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_p = None + generation_cfg.top_k = None + generation_cfg.num_beams = 1 + + # Retriever + self.embedder = SentenceTransformer(self.cfg["retriever_model_id"], trust_remote_code=True) + + def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: + """Prepare the retrieval index from training examples.""" + if task != "term-typing": + return super().fit(train_data, task, ontologizer) + + # Normalize incoming training data -> list[(term, [types])] + self.train_term_types = self._unpack_train(train_data) + + # Build the textual corpus to index + self.indexed_corpus = [ + f"{term} || {json.dumps(types, ensure_ascii=False)}" for term, types in self.train_term_types + ] + + # Embed the corpus if available; else fall back to zero-shot prompting + if self.indexed_corpus and self.embedder is not None: + self.corpus_embeddings = self._encode_texts(self.indexed_corpus) + else: + self.corpus_embeddings = None + + def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: + """Predict types for evaluation items; returns a list of {term, types, id?}.""" + if task != "term-typing": + return super().predict(eval_data, task, ontologizer) + + eval_terms, eval_ids = self._unpack_eval(eval_data) + if not eval_terms: + return [] + + # Use RAG if we have an indexed corpus & embeddings; otherwise zero-shot + rag_available = ( + self.corpus_embeddings is not None and self.embedder is not None and len(self.indexed_corpus) > 0 + ) + + if rag_available: + neighbor_docs_per_query = self._retrieve_batch(eval_terms, top_k=int(self.cfg["top_k"])) + else: + neighbor_docs_per_query = [[] for _ in eval_terms] + + # Compose prompts + prompts: List[str] = [] + for term, neighbor_docs in zip(eval_terms, neighbor_docs_per_query): + example_pairs = self._decode_examples(neighbor_docs) + examples_block = self._format_examples(example_pairs) + prompt_text = self._compose_prompt(examples_block, term) + prompts.append(prompt_text) + + predicted_types_lists = self._generate_and_parse(prompts) + + # Build standardized results + results: List[Dict[str, Any]] = [] + for term, example_id, predicted_types in zip(eval_terms, eval_ids, predicted_types_lists): + result_row: Dict[str, Any] = { + "term": term, + "types": sorted({t for t in predicted_types}), # unique + sorted + } + if example_id is not None: + 
result_row["id"] = example_id + results.append(result_row) + + assert all(("term" in row and "types" in row) for row in results), "predict() must return term + types" + return results + + def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: + """Extract (term, [types]) tuples from supported training payloads.""" + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + parsed_pairs: List[Tuple[str, List[str]]] = [] + for item in term_typings: + term = getattr(item, "term", None) + types = list(getattr(item, "types", []) or []) + if term and types: + parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + return parsed_pairs + + if isinstance(data, list) and data and isinstance(data[0], dict): + parsed_pairs = [] + for row in data: + term = row.get("term") + types = row.get("types") or [] + if term and isinstance(types, list) and types: + parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + return parsed_pairs + + # If only a list of strings is provided, there's nothing to index for RAG + if isinstance(data, (list, set, tuple)) and all(isinstance(x, str) for x in data): + return [] + + return [] + + def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: + """Extract (terms, ids) from supported evaluation payloads.""" + term_typings = getattr(data, "term_typings", None) + if term_typings is not None: + terms: List[str] = [] + ids: List[Optional[str]] = [] + for item in term_typings: + terms.append(getattr(item, "term", "")) + ids.append(getattr(item, "id", None)) + return terms, ids + + if isinstance(data, list) and data and isinstance(data[0], str): + return list(data), [None] * len(data) + + if isinstance(data, list) and data and isinstance(data[0], dict): + terms: List[str] = [] + ids: List[Optional[str]] = [] + for row in data: + terms.append(row.get("term", "")) + ids.append(row.get("id")) + return terms, ids + + return [], [] + + def _encode_texts(self, texts: List[str]) -> torch.Tensor: + """Encode a batch of texts with the sentence-embedding model.""" + batch_size = int(self.cfg["enc_batch_size"]) + batch_embeddings: List[torch.Tensor] = [] + + for batch_start in range(0, len(texts), batch_size): + batch_texts = texts[batch_start : batch_start + batch_size] + embeddings = self.embedder.encode(batch_texts, convert_to_tensor=True, show_progress_bar=False) + batch_embeddings.append(embeddings) + + return torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + + def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: + """Return for each query the top-k most similar corpus entries (as raw text rows).""" + if self.corpus_embeddings is None or not self.indexed_corpus: + return [[] for _ in queries] + + query_embeddings = self._encode_texts(queries) # [Q, D] + doc_embeddings = self.corpus_embeddings # [N, D] + if query_embeddings.shape[-1] != doc_embeddings.shape[-1]: + raise ValueError( + f"Embedding dim mismatch: {query_embeddings.shape[-1]} vs {doc_embeddings.shape[-1]}" + ) + + # Cosine similarity via L2-normalized dot product + q_norm = F.normalize(query_embeddings, p=2, dim=1) + d_norm = F.normalize(doc_embeddings, p=2, dim=1) + cos_sim = torch.matmul(q_norm, d_norm.T) # [Q, N] + + k = min(max(1, top_k), len(self.indexed_corpus)) + _, top_indices = torch.topk(cos_sim, k=k, dim=1) + return [[self.indexed_corpus[j] for j in row.tolist()] for row in top_indices] + + def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: + 
"""Parse raw corpus rows ('term || [types]') into (term, [types]) pairs.""" + example_pairs: List[Tuple[str, List[str]]] = [] + for raw_row in docs: + try: + term_raw, types_json = raw_row.split("||", 1) + term = term_raw.strip() + types_list = json.loads(types_json.strip()) + if isinstance(types_list, list): + example_pairs.append((term, [t for t in types_list if isinstance(t, str)])) + except Exception: + continue + return example_pairs + + def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: + """Format retrieved example pairs into a compact block for the prompt.""" + if not pairs: + return "EXAMPLES: (none provided)" + lines: List[str] = ["CLASSIFICATION EXAMPLES:"] + for idx, (term, types) in enumerate(pairs, 1): + preview_types = types[:3] # keep context small + lines.append(f"{idx}. Term: '{term}' → Types: {list(preview_types)}") + lines.append("END OF EXAMPLES.") + return "\n".join(lines) + + def _compose_prompt(self, examples_block: str, term: str) -> str: + """Compose the final prompt from system + user blocks.""" + user_block = self._user_prompt_template.format(examples=examples_block, term=term) + return f"{self._system_prompt}\n\n{user_block}\n" + + def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: + """Run generation for a batch of prompts and parse the JSON 'types' from outputs.""" + batch_size = int(self.cfg["gen_batch_size"]) + all_predicted_types: List[List[str]] = [] + + for batch_start in range(0, len(prompts), batch_size): + prompt_batch = prompts[batch_start : batch_start + batch_size] + + # Tokenize and move to the LLM's device + model_device = getattr(self.generation_model, "device", None) + encodings = self.tokenizer(prompt_batch, return_tensors="pt", padding=True).to(model_device) + input_token_length = encodings["input_ids"].shape[1] + + # Deterministic decoding (greedy) + with torch.no_grad(): + generated_tokens = self.generation_model.generate( + **encodings, + do_sample=False, + num_beams=1, + temperature=None, + top_p=None, + top_k=None, + max_new_tokens=int(self.cfg["max_new_tokens"]), + pad_token_id=self.tokenizer.eos_token_id, + ) + + # Slice off the prompt tokens and decode only newly generated tokens + new_token_span = generated_tokens[:, input_token_length:] + decoded_texts = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in new_token_span] + + parsed_types_per_prompt = [self._parse_types(text) for text in decoded_texts] + all_predicted_types.extend(parsed_types_per_prompt) + + return all_predicted_types + + def _parse_types(self, text: str) -> List[str]: + """Extract a list of type strings from LLM output. + + Attempts (in order): + 1) Strict JSON object with "types". + 2) Regex-extract JSON object containing "types". + 3) Regex-extract first bracketed list. + 4) Comma-split fallback. 
+ """ + try: + obj = json.loads(text) + if isinstance(obj, dict) and isinstance(obj.get("types"), list): + return [t for t in obj["types"] if isinstance(t, str)] + except Exception: + pass + + try: + obj_match = re.search(r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S) + if obj_match: + obj = json.loads(obj_match.group(0)) + types = obj.get("types", []) + return [t for t in types if isinstance(t, str)] + except Exception: + pass + + try: + list_match = re.search(r'\[([^\]]+)\]', text) + if list_match: + items = [x.strip().strip('"').strip("'") for x in list_match.group(1).split(",")] + return [t for t in items if t] + except Exception: + pass + + if "," in text: + items = [x.strip().strip('"').strip("'") for x in text.split(",")] + return [t for t in items if t] + + return [] diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py index 30e8372..6408881 100644 --- a/ontolearner/learner/text2onto/__init__.py +++ b/ontolearner/learner/text2onto/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .sbunlp import SBUNLPFewShotLearner +from .alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py new file mode 100644 index 0000000..5760dca --- /dev/null +++ b/ontolearner/learner/text2onto/alexbek.py @@ -0,0 +1,1084 @@ +# Copyright (c) 2025 SciKnowOrg +# +# Licensed under the MIT License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple, Iterable +import json +from json.decoder import JSONDecodeError +import os +import random +import re + +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from ...base import AutoLearner, AutoLLM + +try: + from outlines.models import Transformers as OutlinesTFModel + from outlines.generate import json as outlines_generate_json + from pydantic import BaseModel + + class _PredictedTypesSchema(BaseModel): + """Schema used when generating structured JSON { "types": [...] }.""" + types: List[str] + + OUTLINES_AVAILABLE: bool = True +except Exception: + # If outlines is unavailable, we will fall back to greedy decoding + regex parsing. + OUTLINES_AVAILABLE = False + _PredictedTypesSchema = None + OutlinesTFModel = None + outlines_generate_json = None + +class LocalAutoLLM(AutoLLM): + """ + Minimal local LLM helper. + + - Inherits AutoLLM but overrides load/generate to avoid label_mapper. + - Optional 4-bit loading with `load_in_4bit=True` in .load(). + - Greedy decoding by default (deterministic). + """ + + def __init__(self, device: str = "cpu", token: str = "") -> None: + """ + Initialize the local LLM holder. + + Parameters + ---------- + device : str + Execution device: "cpu" or "cuda". + token : str + Optional auth token for private model hubs. 
+ """ + super().__init__(label_mapper=None, device=device, token=token) + self.model: Optional[AutoModelForCausalLM] = None + self.tokenizer: Optional[AutoTokenizer] = None + + def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: + """ + Load a Hugging Face causal model + tokenizer and set deterministic + generation defaults. + + Parameters + ---------- + model_id : str + Model identifier resolvable by HF `from_pretrained`. + load_in_4bit : bool + If True and bitsandbytes is available, load using 4-bit quantization. + """ + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, padding_side="left", token=self.token + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Model (optionally quantized) + if load_in_4bit: + from transformers import BitsAndBytesConfig + + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + quantization_config=quantization_config, + token=self.token, + ) + else: + device_map = "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + self.model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + token=self.token, + ) + + # Deterministic generation defaults + generation_cfg = self.model.generation_config + generation_cfg.do_sample = False + generation_cfg.temperature = None + generation_cfg.top_k = None + generation_cfg.top_p = None + generation_cfg.num_beams = 1 + + def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: + """ + Greedy-generate continuations for a list of prompts. + + Parameters + ---------- + prompts : List[str] + Prompts to generate for (batched). + max_new_tokens : int + Maximum number of new tokens per continuation. + + Returns + ------- + List[str] + Decoded new-token texts (no special tokens, stripped). + """ + if self.model is None or self.tokenizer is None: + raise RuntimeError("Call .load(model_id) on LocalAutoLLM before generate().") + + tokenized_batch = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True) + input_seq_len = tokenized_batch["input_ids"].shape[1] + tokenized_batch = {k: v.to(self.model.device) for k, v in tokenized_batch.items()} + + with torch.no_grad(): + outputs = self.model.generate( + **tokenized_batch, + max_new_tokens=max_new_tokens, + pad_token_id=self.tokenizer.eos_token_id, + do_sample=False, + num_beams=1, + ) + + # Only return the newly generated part for each row in the batch + continuation_token_ids = outputs[:, input_seq_len:] + return [self.tokenizer.decode(row, skip_special_tokens=True).strip() for row in continuation_token_ids] + +class AlexbekFewShotLearner(AutoLearner): + """ + Text2Onto learner for LLMS4OL Task A (term & type extraction). + + Public API (A1 + convenience): + - fit(train_docs_jsonl, terms2doc_json, sample_size=24, seed=42) + - predict_terms(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - predict_types(docs_test_jsonl, out_jsonl, max_new_tokens=128, few_shot_k=6) -> int + - evaluate_extraction_f1(gold_item2docs_json, preds_jsonl, key="term"|"type") -> float + + Option A (A2, term→types) bridge: + - predict_types_from_terms_option_a(...) 
+ Reads your A1 results (docs→terms), predicts types for each term, and + writes two files: terms2types_pred.json + types2docs_pred.json + """ + def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: + """ + Initialize learner state and canned prompts. + + Parameters + ---------- + model : LocalAutoLLM + Loaded local LLM helper instance. + device : str + Device name ("cpu" or "cuda"). + """ + super().__init__(**_) + self.model = model + self.device = device + + # Few-shot exemplars for A1 (Docs→Terms) and for Docs→Types: + # Each exemplar is a tuple: (title, text, gold_list) + self._fewshot_terms_docs: List[Tuple[str, str, List[str]]] = [] + self._fewshot_types_docs: List[Tuple[str, str, List[str]]] = [] + + # System prompts + self._system_prompt_terms = ( + "You are an expert in ontology term extraction.\n" + "Extract only terms that explicitly appear in the document.\n" + 'Answer strictly as JSON: {"terms": ["..."]}\n' + ) + self._system_prompt_types = ( + "You are an expert in ontology type classification.\n" + "List ontology *types* that characterize the document’s terminology.\n" + 'Answer strictly as JSON: {"types": ["..."]}\n' + ) + + # Compiled regex for robust JSON extraction from LLM outputs + self._json_object_regex = re.compile(r"\{[^{}]*\}", re.S) + self._json_array_regex = re.compile(r"\[[^\]]*\]", re.S) + + # Term→Types (Option A) specific prompt + self._system_prompt_term_to_types = ( + "You are an expert in ontology and semantic type classification.\n" + "Given a term, predict its semantic types from the domain-specific ontology.\n" + 'Answer strictly as JSON:\n{"types": ["type1", "type2", "..."]}' + ) + + def fit( + self, + *, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 24, + seed: int = 42, + ) -> None: + """ + Build internal few-shot exemplars from a labeled training split. + + Parameters + ---------- + train_docs_jsonl : str + Path to JSONL (or tolerant JSON/JSONL) with train documents. + terms2doc_json : str + JSON mapping item -> [doc_id,...]; "item" can be a term or type. + sample_size : int + Number of exemplar documents to keep for few-shot prompting. + seed : int + RNG seed for reproducible sampling. 
+ """ + rng = random.Random(seed) + + # Load documents and map doc_id -> row + document_map = self._load_documents_jsonl(train_docs_jsonl) + if not document_map: + raise FileNotFoundError(f"No documents found in: {train_docs_jsonl}") + + # Load item -> [doc_ids] + item_to_docs_map = self._load_json(terms2doc_json) + if not isinstance(item_to_docs_map, dict): + raise ValueError(f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]") + + # Reverse mapping: doc_id -> [items] + doc_id_to_items_map: Dict[str, List[str]] = {} + for item_label, doc_id_list in item_to_docs_map.items(): + for doc_id in doc_id_list: + doc_id_to_items_map.setdefault(doc_id, []).append(item_label) + + # Build candidate exemplars (title, text, gold_list) + exemplar_candidates: List[Tuple[str, str, List[str]]] = [] + for doc_id, labeled_items in doc_id_to_items_map.items(): + doc_row = document_map.get(doc_id) + if not doc_row: + continue + doc_title = str(doc_row.get("title", "")) # be defensive (may be None) + doc_text = self._to_text(doc_row.get("text", "")) # string-ify list if needed + if not doc_text: + continue + gold_items = self._unique_preserve([s for s in labeled_items if isinstance(s, str)]) + if gold_items: + exemplar_candidates.append((doc_title, doc_text, gold_items)) + + if not exemplar_candidates: + raise RuntimeError("No candidate docs with items found to build few-shot exemplars.") + + chosen_exemplars = rng.sample(exemplar_candidates, k=min(sample_size, len(exemplar_candidates))) + # Reuse exemplars for both docs→terms and docs→types prompting + self._fewshot_terms_docs = chosen_exemplars + self._fewshot_types_docs = chosen_exemplars + + def predict_terms( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Extract terms that explicitly appear in each document. + + Writes one JSON object per line: + {"id": "", "terms": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). 
+ """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_terms, self._fewshot_terms_docs, key="terms", k=few_shot_k + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_term_lists = [self._parse_json_list(generated, key="terms") for generated in generations] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, term_list in zip(document_order, parsed_term_lists): + payload = {"id": document_id, "terms": self._unique_preserve(term_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + + def predict_types( + self, + *, + docs_test_jsonl: str, + out_jsonl: str, + max_new_tokens: int = 128, + few_shot_k: int = 6, + ) -> int: + """ + Predict ontology types that characterize each document’s terminology. + + Writes one JSON object per line: + {"id": "", "types": ["...", "...", ...]} + + Parameters + ---------- + docs_test_jsonl : str + Path to test/dev documents in JSONL or tolerant JSON/JSONL. + out_jsonl : str + Output JSONL path where predictions are written (one line per doc). + max_new_tokens : int + Max generation length. + few_shot_k : int + Number of few-shot exemplars to prepend per prompt. + + Returns + ------- + int + Number of lines written (i.e., number of processed documents). + """ + if self.model is None or self.model.model is None: + raise RuntimeError("Load a model first: learner.model.load(MODEL_ID, ...)") + + test_documents = self._load_documents_jsonl(docs_test_jsonl) + prompts: List[str] = [] + document_order: List[str] = [] + + for document_id, document_row in test_documents.items(): + title = str(document_row.get("title", "")) + text = self._to_text(document_row.get("text", "")) + + fewshot_block = self._format_fewshot_block( + self._system_prompt_types, self._fewshot_types_docs, key="types", k=few_shot_k + ) + user_block = self._format_user_block(title, text) + + prompts.append(f"{fewshot_block}\n{user_block}\nAssistant:") + document_order.append(document_id) + + generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) + parsed_type_lists = [self._parse_json_list(generated, key="types") for generated in generations] + + os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) + lines_written = 0 + with open(out_jsonl, "w", encoding="utf-8") as fp_out: + for document_id, type_list in zip(document_order, parsed_type_lists): + payload = {"id": document_id, "types": self._unique_preserve(type_list)} + fp_out.write(json.dumps(payload, ensure_ascii=False) + "\n") + lines_written += 1 + return lines_written + + def evaluate_extraction_f1( + self, + gold_item2docs_json: str, + preds_jsonl: str, + *, + key: str = "term", + ) -> float: + """ + Compute micro-F1 over (doc_id, item) pairs. + + Parameters + ---------- + gold_item2docs_json : str + JSON mapping item -> [doc_ids]. 
+ preds_jsonl : str + JSONL lines like {"id": "...", "terms":[...]} or {"id":"...","types":[...]}. + key : str + "term" or "type" depending on what you are evaluating. + + Returns + ------- + float + Micro-averaged F1 score. + """ + item_to_doc_ids: Dict[str, List[str]] = self._load_json(gold_item2docs_json) + + # Build gold: doc_id -> set(items) + gold_doc_to_items: Dict[str, set] = {} + for item_label, doc_id_list in item_to_doc_ids.items(): + for document_id in doc_id_list: + gold_doc_to_items.setdefault(document_id, set()).add(self._norm(item_label)) + + # Build predictions: doc_id -> set(items) + pred_doc_to_items: Dict[str, set] = {} + with open(preds_jsonl, "r", encoding="utf-8") as fp_in: + for line in fp_in: + row = json.loads(line.strip()) + document_id = str(row.get("id", "")) + items_list = row.get("terms" if key == "term" else "types", []) + pred_doc_to_items[document_id] = {self._norm(x) for x in items_list if isinstance(x, str)} + + # Micro counts + true_positive = false_positive = false_negative = 0 + all_document_ids = set(gold_doc_to_items.keys()) | set(pred_doc_to_items.keys()) + for document_id in all_document_ids: + gold_set = gold_doc_to_items.get(document_id, set()) + pred_set = pred_doc_to_items.get(document_id, set()) + true_positive += len(gold_set & pred_set) + false_positive += len(pred_set - gold_set) + false_negative += len(gold_set - pred_set) + + precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0 + recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0 + return f1 + + def predict_types_from_terms( + self, + *, + doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl + doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list + few_shot_jsonl: Optional[str] = None, # JSONL lines: {"term":"...", "types":[...]} + rag_terms_json: Optional[str] = None, # JSON list; items may contain "term" and "RAG":[...] + random_few_shot: Optional[int] = 3, + model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", + use_structured_output: bool = True, + seed: int = 42, + out_terms2types: str = "terms2types_pred.json", + out_types2docs: str = "types2docs_pred.json", + ) -> Dict[str, Any]: + """ + Predict types for each unique term extracted per document and derive a types→docs map. + + Parameters + ---------- + doc_terms_jsonl : Optional[str] + Path to JSONL with lines like {"id": "...", "terms": [...]} or a JSON with {"results":[...]}. + doc_terms_list : Optional[List[Dict]] + In-memory results like [{"id":"...","extracted_terms":[...]}] or {"id":"...","terms":[...]}. + few_shot_jsonl : Optional[str] + Global few-shot exemplars: one JSON object per line with {"term": "...", "types":[...]}. + rag_terms_json : Optional[str] + Optional per-term RAG exemplars: a JSON list of {"term": "...", "RAG":[{"term": "...", "types":[...]}]}. + random_few_shot : Optional[int] + If provided, randomly select up to this many few-shot examples for each prediction. + model_id : str + HF model id used specifically for term→types predictions. + use_structured_output : bool + If True and outlines is available, enforce structured {"types":[...]} output. + seed : int + Random seed for reproducibility. + out_terms2types : str + Output JSON path for list of {"term": "...", "predicted_types":[...]}. + out_types2docs : str + Output JSON path for dict {"TYPE":[doc_ids,...], ...}. 
+ + Returns + ------- + Dict[str, Any] + Summary with predictions and counts. + """ + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + # Load normalized document→terms results + doc_term_extractions = self._load_doc_term_extractions( + results_json_path=doc_terms_jsonl, + in_memory_results=doc_terms_list, + ) + if not doc_term_extractions: + raise ValueError("No document→terms results provided (doc_terms_jsonl/doc_terms_list).") + + # Prepare unique term list and term→doc occurrences + unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) + term_to_doc_ids_map = self._build_term_to_doc_ids(doc_term_extractions) + + # Load optional global few-shot examples + global_few_shot_examples: List[Dict] = [] + if few_shot_jsonl and os.path.exists(few_shot_jsonl): + with open(few_shot_jsonl, "r", encoding="utf-8") as few_shot_file: + for raw_line in few_shot_file: + raw_line = raw_line.strip() + if not raw_line: + continue + try: + json_obj = json.loads(raw_line) + except Exception: + continue + if isinstance(json_obj, dict) and "term" in json_obj and "types" in json_obj: + global_few_shot_examples.append(json_obj) + + # Optional per-term RAG examples: {normalized_term -> [examples]} + rag_examples_lookup: Dict[str, List[Dict]] = {} + if rag_terms_json and os.path.exists(rag_terms_json): + try: + rag_payload = self._load_json(rag_terms_json) + if isinstance(rag_payload, list): + for rag_item in rag_payload: + if isinstance(rag_item, dict): + normalized_term = self._normalize_term(rag_item.get("term", "")) + rag_examples_lookup[normalized_term] = rag_item.get("RAG", []) + except Exception: + pass + + # Load a small chat LLM dedicated to Term→Types + typing_model, typing_tokenizer = self._load_llm_for_types(model_id) + + # Predict types per term + term_to_predicted_types_list: List[Dict] = [] + for term_text in unique_terms: + normalized_term = self._normalize_term(term_text) + + # Prefer per-term RAG for this term, else use global few-shot + few_shot_examples_for_term = rag_examples_lookup.get(normalized_term, None) or global_few_shot_examples + + # Build conversation and prompt + conversation_messages = self._build_conv_for_type_infer( + term=term_text, + few_shot_examples=few_shot_examples_for_term, + random_k=random_few_shot, + ) + typing_prompt_string = self._apply_chat_template_safe_types(typing_tokenizer, conversation_messages) + + predicted_types: List[str] = [] + raw_generation_text: str = "" + + # Structured JSON path (if requested and available) + if use_structured_output and OUTLINES_AVAILABLE and _PredictedTypesSchema is not None: + try: + outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore + generator = outlines_generate_json(outlines_model, _PredictedTypesSchema) # type: ignore + structured = generator(typing_prompt_string, max_tokens=512) + predicted_types = [label for label in structured.types if isinstance(label, str)] + raw_generation_text = json.dumps({"types": predicted_types}, ensure_ascii=False) + except Exception: + # Fall back to greedy decoding + use_structured_output = False + + # Greedy decode fallback + if not use_structured_output or not OUTLINES_AVAILABLE or _PredictedTypesSchema is None: + tokenized_prompt = typing_tokenizer(typing_prompt_string, return_tensors="pt", truncation=True, max_length=2048) + if torch.cuda.is_available(): + tokenized_prompt = {name: tensor.cuda() for name, tensor in tokenized_prompt.items()} + with torch.no_grad(): + output_ids = 
typing_model.generate( + **tokenized_prompt, + max_new_tokens=256, + do_sample=False, + num_beams=1, + pad_token_id=typing_tokenizer.eos_token_id, + ) + new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1]:] + raw_generation_text = typing_tokenizer.decode(new_token_span, skip_special_tokens=True) + predicted_types = self._extract_types_from_text(raw_generation_text) + + term_to_predicted_types_list.append({ + "term": term_text, + "predicted_types": sorted(set(predicted_types)), + }) + + # 7) Build types→docs from (term→types) and (term→docs) + types_to_doc_id_set: Dict[str, set] = {} + for term_prediction in term_to_predicted_types_list: + normalized_term = self._normalize_term(term_prediction["term"]) + doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) + for type_label in term_prediction.get("predicted_types", []): + types_to_doc_id_set.setdefault(type_label, set()).update(doc_ids_for_term) + + types_to_doc_ids: Dict[str, List[str]] = { + type_label: sorted(doc_id_set) for type_label, doc_id_set in types_to_doc_id_set.items() + } + + # 8) Save outputs + os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) + with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: + json.dump(term_to_predicted_types_list, fp_terms2types, ensure_ascii=False, indent=2) + + os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) + with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: + json.dump(types_to_doc_ids, fp_types2docs, ensure_ascii=False, indent=2) + + # Cleanup VRAM if any + del typing_model, typing_tokenizer + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return { + "terms2types_pred": term_to_predicted_types_list, + "types2docs_pred": types_to_doc_ids, + "unique_terms": len(unique_terms), + "types_count": len(types_to_doc_ids), + } + + def _load_json(self, path: str) -> Dict[str, Any]: + """Load a JSON file from disk and return its parsed object.""" + with open(path, "r", encoding="utf-8") as file_obj: + return json.load(file_obj) + + + def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: + """ + Iterate over *all* JSON objects found inside a string. + + Supports cases where multiple JSON objects are concatenated back-to-back + in a single line. It skips stray commas/whitespace between objects. + + Parameters + ---------- + blob : str + A string that may contain one or more JSON objects. + + Yields + ------ + Dict[str, Any] + Each parsed JSON object. + """ + json_decoder = json.JSONDecoder() + cursor_index, text_length = 0, len(blob) + while cursor_index < text_length: + # Skip whitespace/commas between objects + while cursor_index < text_length and blob[cursor_index] in " \t\r\n,": + cursor_index += 1 + if cursor_index >= text_length: + break + try: + json_obj, end_index = json_decoder.raw_decode(blob, idx=cursor_index) + except JSONDecodeError: + # Can't decode from this position; stop scanning this chunk + break + yield json_obj + cursor_index = end_index + + + def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: + """ + Robust reader that supports: + • True JSONL (one object per line) + • Lines with multiple concatenated JSON objects + • Whole file as a JSON array + + Returns + ------- + Dict[str, Dict[str, Any]] + Mapping doc_id -> full document row. 
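+
+        Notes
+        -----
+        Document ids are taken from "id", "doc_id", or a nested
+        {"doc": {"id": ...}} field; records without a resolvable id are skipped.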
+ """ + documents_by_id: Dict[str, Dict[str, Any]] = {} + + with open(path, "r", encoding="utf-8") as file_obj: + content = file_obj.read().strip() + + # Case A: whole-file JSON array + if content.startswith("["): + try: + json_array = json.loads(content) + if isinstance(json_array, list): + for record in json_array: + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + return documents_by_id + except Exception: + # Fall back to line-wise handling if array parsing fails + pass + + # Case B: treat as JSONL-ish; parse *all* objects per line + for raw_line in content.splitlines(): + line = raw_line.strip() + if not line: + continue + for record in self._iter_json_objects(line): + if not isinstance(record, dict): + continue + document_id = str( + record.get("id") + or record.get("doc_id") + or (record.get("doc") or {}).get("id") + or "" + ) + if document_id: + documents_by_id[document_id] = record + + return documents_by_id + + + def _to_text(self, text_field: Any) -> str: + """ + Convert a 'text' field into a single string (handles list-of-strings). + + Parameters + ---------- + text_field : Any + The value found under "text" in the dataset row. + + Returns + ------- + str + A single-string representation of the text. + """ + if isinstance(text_field, str): + return text_field + if isinstance(text_field, list): + return " ".join(str(part) for part in text_field) + return str(text_field) if text_field is not None else "" + + + def _unique_preserve(self, values: List[str]) -> List[str]: + """ + Deduplicate values while preserving the original order. + + Parameters + ---------- + values : List[str] + Sequence possibly containing duplicates. + + Returns + ------- + List[str] + Sequence without duplicates, order preserved. + """ + seen_values: set = set() + ordered_values: List[str] = [] + for candidate in values: + if candidate not in seen_values: + seen_values.add(candidate) + ordered_values.append(candidate) + return ordered_values + + + def _norm(self, text: str) -> str: + """ + Lowercased, single-spaced normalization (for comparisons). + + Parameters + ---------- + text : str + Input string. + + Returns + ------- + str + Normalized string. + """ + return " ".join(text.lower().split()) + + + def _normalize_term(self, term: str) -> str: + """ + Normalization tailored for term keys / lookups. + + Parameters + ---------- + term : str + Term to normalize. + + Returns + ------- + str + Lowercased, trimmed and single-spaced term. + """ + return " ".join(str(term).strip().split()).lower() + + + def _format_fewshot_block( + self, + system_prompt: str, + fewshot_examples: List[Tuple[str, str, List[str]]], + *, + key: str, + k: int = 6, + ) -> str: + """ + Render a few-shot block like: + + + + ### Example + User: + Title: ... + + Assistant: + {"terms": [...]} or {"types": [...]} + + Parameters + ---------- + system_prompt : str + Instructional system text to prepend. + fewshot_examples : List[Tuple[str, str, List[str]]] + Examples as (title, text, labels_list). + key : str + Either "terms" or "types" depending on the task. + k : int + Number of examples to include. + + Returns + ------- + str + Formatted few-shot block text. 
+ """ + lines: List[str] = [system_prompt.strip(), ""] + for example_title, example_text, gold_list in fewshot_examples[:k]: + lines.append("### Example") + lines.append(f"User:\nTitle: {example_title}\n{example_text}") + lines.append(f'Assistant:\n{{"{key}": ' + json.dumps(gold_list, ensure_ascii=False) + "}") + return "\n".join(lines) + + + def _format_user_block(self, title: str, text: str) -> str: + """ + Format the 'Task' block for the current document. + + Parameters + ---------- + title : str + Document title. + text : str + Document text (single string). + + Returns + ------- + str + Formatted user block. + """ + return f"### Task\nUser:\nTitle: {title}\n{text}" + + + def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: + """ + Extract a list from model output, trying: + 1) JSON object with the key ({"terms":[...]} or {"types":[...]}). + 2) Any top-level JSON array. + 3) Fallback: comma-split. + + Parameters + ---------- + generated_text : str + Raw generation text to parse. + key : str + "terms" or "types". + + Returns + ------- + List[str] + Parsed strings (best-effort). + """ + # 1) Try a JSON object and read key + try: + object_match = self._json_object_regex.search(generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + json_array = json_obj.get(key) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 2) Any JSON array + try: + array_match = self._json_array_regex.search(generated_text) + if array_match: + json_array = json.loads(array_match.group(0)) + if isinstance(json_array, list): + return [value for value in json_array if isinstance(value, str)] + except Exception: + pass + + # 3) Fallback: comma-split (last resort) + if "," in generated_text: + return [part.strip().strip('"').strip("'") for part in generated_text.split(",") if part.strip()] + return [] + + + def _apply_chat_template_safe_types(self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]]) -> str: + """ + Safely build a prompt string for chat models. Uses the model's chat template + when available; otherwise falls back to a simple concatenation. + """ + try: + return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + except Exception: + system_text = next((m["content"] for m in messages if m.get("role") == "system"), "") + last_user_text = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" + + + def _build_conv_for_type_infer( + self, + term: str, + few_shot_examples: Optional[List[Dict]] = None, + random_k: Optional[int] = None, + ) -> List[Dict[str, str]]: + """ + Create a chat-style conversation for a single term→types query, + optionally prepending few-shot examples. 
+ """ + messages: List[Dict[str, str]] = [{"role": "system", "content": self._system_prompt_term_to_types}] + examples = list(few_shot_examples or []) + if random_k and len(examples) > random_k: + import random as _rnd + examples = _rnd.sample(examples, random_k) + for exemplar in examples: + example_term = exemplar.get("term", "") + example_types = exemplar.get("types", []) + messages.append({"role": "user", "content": f"Term: {example_term}"}) + messages.append({"role": "assistant", "content": json.dumps({"types": example_types}, ensure_ascii=False)}) + messages.append({"role": "user", "content": f"Term: {term}"}) + return messages + + + def _extract_types_from_text(self, generated_text: str) -> List[str]: + """ + Parse {"types":[...]} from a free-form generation. + """ + try: + object_match = re.search(r'\{[^}]*"types"[^}]*\}', generated_text) + if object_match: + json_obj = json.loads(object_match.group(0)) + types_array = json_obj.get("types", []) + return [type_label for type_label in types_array if isinstance(type_label, str)] + except Exception: + pass + return [] + + + def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + """ + Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). + """ + tokenizer = AutoTokenizer.from_pretrained(model_id) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + device_map="auto" if torch.cuda.is_available() else None, + ) + return model, tokenizer + + + def _load_doc_term_extractions( + self, + *, + results_json_path: Optional[str] = None, + in_memory_results: Optional[List[Dict]] = None, + ) -> List[Dict]: + """ + Normalize document→terms outputs to a list of: + {"id": "", "extracted_terms": ["...", ...]} + + Accepts either: + - in_memory_results (list of dicts) + - results_json_path pointing to: + • a JSONL file with lines: {"id": "...", "terms": [...]} + • OR a JSON file with {"results":[{"id":..., "extracted_terms": [...]}, ...]} + • OR a JSON list of dicts + """ + normalized_records: List[Dict] = [] + + def _coerce_to_record(source_row: Dict) -> Optional[Dict]: + document_id = str(source_row.get("id", "")) or str(source_row.get("doc_id", "")) + if not document_id: + return None + terms = source_row.get("extracted_terms") + if terms is None: + terms = source_row.get("terms") + if terms is None and "payload" in source_row and isinstance(source_row["payload"], dict): + terms = source_row["payload"].get("terms") + if not isinstance(terms, list): + terms = [] + return {"id": document_id, "extracted_terms": [t for t in terms if isinstance(t, str)]} + + if in_memory_results is not None: + for source_row in in_memory_results: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + return normalized_records + + if not results_json_path: + raise ValueError("Provide results_json_path or in_memory_results") + + # Detect JSON vs JSONL by extension (best-effort) + if results_json_path.endswith(".jsonl"): + with open(results_json_path, "r", encoding="utf-8") as file_in: + for raw_line in file_in: + raw_line = raw_line.strip() + if not raw_line: + continue + # Multiple concatenated objects per line? Iterate them all. 
+ for json_obj in self._iter_json_objects(raw_line): + if isinstance(json_obj, dict): + coerced_record = _coerce_to_record(json_obj) + if coerced_record: + normalized_records.append(coerced_record) + else: + payload_obj = self._load_json(results_json_path) + if isinstance(payload_obj, dict) and "results" in payload_obj: + for source_row in payload_obj["results"]: + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + elif isinstance(payload_obj, list): + for source_row in payload_obj: + if isinstance(source_row, dict): + coerced_record = _coerce_to_record(source_row) + if coerced_record: + normalized_records.append(coerced_record) + + return normalized_records + + + def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict]) -> List[str]: + """ + Collect unique terms (original casing) from normalized document→terms results. + """ + seen_normalized_terms: set = set() + ordered_unique_terms: List[str] = [] + for record in doc_term_extractions: + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if normalized and normalized not in seen_normalized_terms: + seen_normalized_terms.add(normalized) + ordered_unique_terms.append(term_text.strip()) + return ordered_unique_terms + + + def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, List[str]]: + """ + Build lookup: normalized_term -> sorted unique list of doc_ids. + """ + term_to_doc_set: Dict[str, set] = {} + for record in doc_term_extractions: + document_id = str(record.get("id", "")) + for term_text in record.get("extracted_terms", []): + normalized = self._normalize_term(term_text) + if not normalized or not document_id: + continue + term_to_doc_set.setdefault(normalized, set()).add(document_id) + return {normalized_term: sorted(doc_ids) for normalized_term, doc_ids in term_to_doc_set.items()} From 1abbbc91e7c65321da0f25f1f41b190c3776986d Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Mon, 10 Nov 2025 23:52:46 +0100 Subject: [PATCH 5/7] added changes for taxonomy discovery and term typing --- .../llm_learner_alexbek_rag_term_typing.py | 10 +- .../llm_learner_alexbek_rf_term_typing.py | 24 +- ...er_alexbek_self_attn_taxonomy_discovery.py | 9 +- examples/llm_learner_alexbek_text2onto.py | 32 +- ...llm_learner_rwthdbis_taxonomy_discovery.py | 17 +- examples/llm_learner_rwthdbis_term_typing.py | 13 +- ...lm_learner_sbunlp_fs_taxonomy_discovery.py | 33 +- examples/llm_learner_sbunlp_text2onto.py | 29 +- examples/llm_learner_sbunlp_zs_term_typing.py | 20 +- ..._learner_skhnlp_sft_taxonomoy_discovery.py | 10 +- ...m_learner_skhnlp_zs_taxonomoy_discovery.py | 13 +- .../learner/taxonomy_discovery/__init__.py | 18 - .../learner/taxonomy_discovery/alexbek.py | 291 +++++- .../learner/taxonomy_discovery/rwthdbis.py | 922 ++++++++++++------ .../learner/taxonomy_discovery/sbunlp.py | 393 +++++--- .../learner/taxonomy_discovery/skhnlp.py | 561 +++++++++-- ontolearner/learner/term_typing/__init__.py | 17 - ontolearner/learner/term_typing/alexbek.py | 665 +++++++++++-- ontolearner/learner/term_typing/rwthdbis.py | 214 +++- ontolearner/learner/term_typing/sbunlp.py | 404 ++++---- ontolearner/learner/text2onto/__init__.py | 16 - ontolearner/learner/text2onto/alexbek.py | 293 ++++-- ontolearner/learner/text2onto/sbunlp.py | 127 ++- 23 files changed, 2956 insertions(+), 1175 deletions(-) delete mode 100644 ontolearner/learner/taxonomy_discovery/__init__.py delete mode 100644 
ontolearner/learner/term_typing/__init__.py delete mode 100644 ontolearner/learner/text2onto/__init__.py diff --git a/examples/llm_learner_alexbek_rag_term_typing.py b/examples/llm_learner_alexbek_rag_term_typing.py index 5723e36..3a3233f 100644 --- a/examples/llm_learner_alexbek_rag_term_typing.py +++ b/examples/llm_learner_alexbek_rag_term_typing.py @@ -1,13 +1,15 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekRAGLearner +from ontolearner.learner.term_typing.alexbek import AlexbekRAGLearner # Load the GeoNames ontology. ontology = GeoNames() ontology.load() # Extract labeled items and split into train/test sets for evaluation -train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42) +train_data, test_data = train_test_split( + ontology.extract(), test_size=0.2, random_state=42 +) # Configure a Retrieval-Augmented Generation (RAG) term-typing classifier. # - llm_model_id: generator used to predict types from the prompt + retrieved examples @@ -46,5 +48,7 @@ ) # Display the evaluation results and runtime -print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} +print( + "Metrics:", outputs.get("metrics") +) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...} print("Elapsed time (s):", outputs.get("elapsed_time")) diff --git a/examples/llm_learner_alexbek_rf_term_typing.py b/examples/llm_learner_alexbek_rf_term_typing.py index c5c7454..28ca94c 100644 --- a/examples/llm_learner_alexbek_rf_term_typing.py +++ b/examples/llm_learner_alexbek_rf_term_typing.py @@ -1,6 +1,8 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features +from ontolearner.learner.term_typing.alexbek import ( + AlexbekRFLearner, +) # A random-forest term-typing learner over text+graph features # Load the GeoNames ontology and extract labeled term-typing data @@ -10,31 +12,27 @@ data = ontology.extract() # Split the labeled term-typing data into train and test sets -train_data, test_data = train_test_split( - data, - test_size=0.2, - random_state=42 -) +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) # Configure the RF-based learner (embeddings + optional graph features) # - device: "cpu" or "cuda" # - threshold: decision threshold for multi-label assignment # - use_graph_features: include ontology-graph-derived features if available rf_learner = AlexbekRFLearner( - device="cpu", # switch to "cuda" if you have a GPU + device="cpu", # switch to "cuda" if you have a GPU batch_size=16, - max_length=512, # max tokenizer length for embedding model inputs - threshold=0.30, # probability cutoff for assigning each type - use_graph_features=True # set False for pure RF on text embeddings only + max_length=512, # max tokenizer length for embedding model inputs + threshold=0.30, # probability cutoff for assigning each type + use_graph_features=True, # set False for pure RF on text embeddings only ) # Build the pipeline and pass raw structured objects end-to-end. 
pipe = LearnerPipeline( retriever=rf_learner, - retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory - ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} + retriever_id="intfloat/e5-base-v2", # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory + ontologizer_data=True, # True if data is already {"term": ..., "types": [...], ...} device="cpu", - batch_size=16 + batch_size=16, ) # Run the full learning pipeline on the term-typing task diff --git a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py index b78976f..6a42160 100644 --- a/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py +++ b/examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py @@ -1,5 +1,6 @@ from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import AlexbekCrossAttnLearner +from ontolearner.learner.taxonomy_discovery.alexbek import AlexbekCrossAttnLearner + # 1) Load & split ontology = GeoNames() ontology.load() @@ -22,9 +23,9 @@ # 3) Build pipeline pipeline = LearnerPipeline( - llm=cross_learner, # <- our learner - llm_id="cross-attn", # label for bookkeeping - ontologizer_data=False # pass raw ontology objects as in your example + llm=cross_learner, # <- our learner + llm_id="cross-attn", # label for bookkeeping + ontologizer_data=False, # pass raw ontology objects as in your example ) # 4) Train + predict + evaluate diff --git a/examples/llm_learner_alexbek_text2onto.py b/examples/llm_learner_alexbek_text2onto.py index caf4c5b..69282a9 100644 --- a/examples/llm_learner_alexbek_text2onto.py +++ b/examples/llm_learner_alexbek_text2onto.py @@ -9,14 +9,22 @@ DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology" # Input paths (already saved) -TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") -TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") -TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl") +TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl") +TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json") +TEST_DOCS_FULL_PATH = os.path.join( + DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl" +) # Output paths -DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl") -TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json") -TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json") +DOC_TERMS_OUT_PATH = os.path.join( + DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl" +) +TERMS2TYPES_OUT_PATH = os.path.join( + DATA_DIR, "test", "terms2types_pred_ecology.fast.json" +) +TYPES2DOCS_OUT_PATH = os.path.join( + DATA_DIR, "test", "types2docs_pred_ecology.fast.json" +) # Device selection DEVICE = ( @@ -27,7 +35,7 @@ # Model config MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" -LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU +LOAD_IN_4BIT = DEVICE == "cuda" # 4-bit helps on GPU # 1) Load LLM llm = LocalAutoLLM(device=DEVICE) @@ -52,15 +60,17 @@ # 4) Predict types for extracted terms, using the JSONL we just wrote typing_summary = learner.predict_types_from_terms( - doc_terms_jsonl=DOC_TERMS_OUT_PATH, # read the predictions directly - doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) - model_id=MODEL_ID, # reuse the same small model + doc_terms_jsonl=DOC_TERMS_OUT_PATH, 
# read the predictions directly + doc_terms_list=None, # (not needed when doc_terms_jsonl is provided) + model_id=MODEL_ID, # reuse the same small model out_terms2types=TERMS2TYPES_OUT_PATH, out_types2docs=TYPES2DOCS_OUT_PATH, # use defaults for everything else ) -print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types") +print( + f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types" +) print(f"[saved] {TERMS2TYPES_OUT_PATH}") print(f"[saved] {TYPES2DOCS_OUT_PATH}") diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py index fea5539..4412c5f 100644 --- a/examples/llm_learner_rwthdbis_taxonomy_discovery.py +++ b/examples/llm_learner_rwthdbis_taxonomy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library -from ontolearner import LearnerPipeline, train_test_split -from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner +from ontolearner import LearnerPipeline, train_test_split, ChordOntology +from ontolearner.learner.taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner # Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery ontology = ChordOntology() @@ -8,17 +8,16 @@ # Extract typed taxonomic edges and split into train/test while preserving the structured shape train_data, test_data = train_test_split( - ontology.extract(), - test_size=0.2, - random_state=42 + ontology.extract(), test_size=0.2, random_state=42 ) # Initialize a supervised taxonomy classifier (encoder-based fine-tuning) # Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views # Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions -learner = RWTHDBISTaxonomyLearner( +learner = RWTHDBISSFTLearner( model_name="microsoft/deberta-v3-small", output_dir="./results/", + device="cpu", num_train_epochs=1, per_device_train_batch_size=8, gradient_accumulation_steps=4, @@ -48,10 +47,12 @@ ) # Display the evaluation results -print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} # Display total elapsed time for training + prediction + evaluation -print("Elapsed time:", outputs['elapsed_time']) +print("Elapsed time:", outputs["elapsed_time"]) # Print all returned outputs (include predictions) print(outputs) diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py index 67d207f..d9bdc4b 100644 --- a/examples/llm_learner_rwthdbis_term_typing.py +++ b/examples/llm_learner_rwthdbis_term_typing.py @@ -1,8 +1,8 @@ # Import core modules from the OntoLearner library from ontolearner import LearnerPipeline, train_test_split, AgrO -from ontolearner import RWTHDBISTermTypingLearner +from ontolearner.learner.term_typing.rwthdbis import RWTHDBISSFTLearner -#load the AgrO ontology. +# load the AgrO ontology. # AgrO provides term-typing supervision where each term can be annotated with one or more types. ontology = AgrO() ontology.load() @@ -13,9 +13,10 @@ # Configure a supervised encoder-based classifier for term typing. # This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results. 
-learner = RWTHDBISTermTypingLearner( +learner = RWTHDBISSFTLearner( model_name="microsoft/deberta-v3-small", output_dir="./results/deberta-v3", + device="cpu", num_train_epochs=30, per_device_train_batch_size=16, gradient_accumulation_steps=2, @@ -41,10 +42,12 @@ ) # Display the evaluation results -print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} +print( + "Metrics:", outputs["metrics"] +) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...} # Display total elapsed time for training + prediction + evaluation -print("Elapsed time:", outputs['elapsed_time']) +print("Elapsed time:", outputs["elapsed_time"]) # Print all returned outputs (include predictions) print(outputs) diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py index 19797a9..2200892 100644 --- a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py +++ b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py @@ -1,19 +1,22 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline + # Import the specific Few-Shot Learner implementation -from ontolearner import SBUNLPFewShotLearner +from ontolearner.learner.taxonomy_discovery.sbunlp import SBUNLPFewShotLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. # GeoNames provides geographic parent-child relationships (is-a hierarchy). ontology = GeoNames() ontology.load() -data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object +data = ( + ontology.extract() +) # Extract the list of taxonomic relationships from the ontology object # Split the taxonomic relationships into train and test sets train_data, test_data = train_test_split( data, - test_size=0.6, # 60% of data used for testing (terms to find relations for) + test_size=0.6, # 60% of data used for testing (terms to find relations for) random_state=42, ) @@ -22,19 +25,17 @@ # This performs in-context learning via N x M batch prompting. 
llm_learner = SBUNLPFewShotLearner( # Model / decoding - model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load - try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency - max_new_tokens=140, # limit the length of the model's response (for JSON output) - max_input_tokens=1500, # limit the total prompt length (context window) - temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON) - top_p=1.0, # top-p sampling disabled with temperature=0.0 - + model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load + try_4bit=True, # uses 4-bit if bitsandbytes + CUDA available for memory efficiency + max_new_tokens=140, # limit the length of the model's response (for JSON output) + max_input_tokens=1500, # limit the total prompt length (context window) + temperature=0.0, # set to 0.0 for deterministic output (best for structured JSON) + top_p=1.0, # top-p sampling disabled with temperature=0.0 # Grid settings (N x M prompts) - n_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks - m_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts) - + n_train_chunks=7, # N: split training examples (few-shot context) into 7 chunks + m_test_chunks=7, # M: split test terms (vocabulary) into 7 chunks (total 49 prompts) # Run controls - limit_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run + limit_prompts=None, # None runs all N x M prompts; set to an integer for a dry-run output_dir="./outputs/taskC_batches", # Optional: dump per-prompt JSON results for debugging ) @@ -43,8 +44,8 @@ pipe = LearnerPipeline( llm=llm_learner, llm_id=llm_learner.model_name, - ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers - device="auto", # automatically select CUDA or CPU + ontologizer_data=True, # Let the learner flatten structured ontology objects via its tasks_* helpers + device="auto", # automatically select CUDA or CPU ) # Run the full learning pipeline on the taxonomy-discovery task diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py index 564f641..cff543c 100644 --- a/examples/llm_learner_sbunlp_text2onto.py +++ b/examples/llm_learner_sbunlp_text2onto.py @@ -1,6 +1,7 @@ import os import torch -#Import all the required classes + +# Import all the required classes from ontolearner import SBUNLPText2OntoLearner from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM @@ -11,8 +12,8 @@ # Ensure the base directories exist # Creates the train and test subdirectories if they don't already exist. -os.makedirs(os.path.join(LOCAL_DATA_DIR, 'train'), exist_ok=True) -os.makedirs(os.path.join(LOCAL_DATA_DIR, 'test'), exist_ok=True) +os.makedirs(os.path.join(LOCAL_DATA_DIR, "train"), exist_ok=True) +os.makedirs(os.path.join(LOCAL_DATA_DIR, "test"), exist_ok=True) # Define local file paths: POINTING TO ALREADY SAVED FILES # These files are used as input for the Fit and Predict phases. @@ -22,10 +23,14 @@ # Output files for predictions (saved directly under LOCAL_DATA_DIR/test) # These files will be created by the predict_terms/types methods. 
-TERMS_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" -TYPES_PRED_OUT = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" +TERMS_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_terms_ecology.jsonl" +) +TYPES_PRED_OUT = ( + "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology/test/extracted_types_ecology.jsonl" +) -#Initialize and Load Learner --- +# Initialize and Load Learner --- MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Determine the device for inference (GPU or CPU) DEVICE = "cuda" if torch.cuda.is_available() else "cpu" @@ -47,7 +52,7 @@ train_docs_jsonl=DOCS_ALL_PATH, terms2doc_json=TERMS2DOC_PATH, sample_size=28, - seed=123 # Seed for stratified random sampling stability + seed=123, # Seed for stratified random sampling stability ) MAX_NEW_TOKENS = 100 @@ -55,7 +60,7 @@ terms_written = learner.predict_terms( docs_test_jsonl=DOCS_TEST_PATH, out_jsonl=TERMS_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS + max_new_tokens=MAX_NEW_TOKENS, ) print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.") @@ -63,7 +68,7 @@ types_written = learner.predict_types( docs_test_jsonl=DOCS_TEST_PATH, out_jsonl=TYPES_PRED_OUT, - max_new_tokens=MAX_NEW_TOKENS + max_new_tokens=MAX_NEW_TOKENS, ) print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.") @@ -77,5 +82,7 @@ print(f"Final Type Extraction F1: {f1_type:.4f}") except Exception as e: - # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) - print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.") + # Catches errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError) + print( + f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created." + ) diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py index 75d01da..54c070c 100644 --- a/examples/llm_learner_sbunlp_zs_term_typing.py +++ b/examples/llm_learner_sbunlp_zs_term_typing.py @@ -1,30 +1,30 @@ # Import core modules from the OntoLearner library from ontolearner import AgrO, train_test_split, LearnerPipeline + # Import the specific Zero-Shot Learner implementation for Term Typing -from ontolearner import SBUNLPZSLearner +from ontolearner.learner.term_typing.sbunlp import SBUNLPZSLearner # Load ontology and split # Load the AgrO ontology for type inventory and test data. ontology = AgrO() ontology.load() -data = ontology.extract() # Extract the full set of relationships/terms +data = ontology.extract() # Extract the full set of relationships/terms # Split the data into train (to learn type inventory) and test (terms to predict) train_data, test_data = train_test_split( data, - test_size=0.6, # 60% of data used for testing + test_size=0.6, # 60% of data used for testing random_state=42, ) # Configure the Qwen Zero-Shot learner (inference-only) # This learner's 'fit' phase learns the vocabulary of allowed type labels. 
llm_learner = SBUNLPZSLearner( - # Model / decoding - model_id="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load - # device= is auto-detected - max_new_tokens=64, # Sufficient length for JSON list of types - temperature=0.0, # Ensures deterministic (greedy) output - # token= None, # Assuming public model access + device="cpu", + max_new_tokens=64, + temperature=0.0, + model_id="Qwen/Qwen2.5-0.5B-Instruct", + token=None, ) # Build pipeline and run @@ -33,7 +33,7 @@ llm=llm_learner, llm_id=llm_learner.model_id, ontologizer_data=False, - device="cpu", # select CUDA or CPU + device="cpu", # select CUDA or CPU ) # Run the full learning pipeline on the Term-Typing task diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py index 3661a5b..5c87925 100644 --- a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py @@ -1,6 +1,6 @@ # Import core modules from the OntoLearner library from ontolearner import GeoNames, train_test_split, LearnerPipeline -from ontolearner import SKHNLPSequentialFTLearner +from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner # Load ontology and split # Load the GeoNames ontology for taxonomy discovery. @@ -10,11 +10,7 @@ data = ontology.extract() # Split the taxonomic relationships into train and test sets -train_data, test_data = train_test_split( - data, - test_size=0.2, - random_state=42 -) +train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) # Configure the learner with user-defined training args + device # Configure the supervised BERT SFT Learner for taxonomy discovery. @@ -23,7 +19,7 @@ model_name="bert-large-uncased", n_prompts=2, random_state=1403, - device="cpu", # Note: CPU training for BERT-Large is very slow. + device="cpu", # Note: CPU training for BERT-Large is very slow. output_dir="./results/", num_train_epochs=1, per_device_train_batch_size=8, diff --git a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py index 90391f5..fec0ddd 100644 --- a/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py +++ b/examples/llm_learner_skhnlp_zs_taxonomoy_discovery.py @@ -1,7 +1,8 @@ # Import core modules from the OntoLearner library -from ontolearner import GeoNames, train_test_split, LearnerPipeline, SKHNLPZSLearner +from ontolearner import GeoNames, train_test_split, LearnerPipeline +from ontolearner.learner.taxonomy_discovery.skhnlp import SKHNLPZSLearner -#Load ontology and split data +# Load ontology and split data # The GeoNames ontology provides geographic term types and relationships. ontology = GeoNames() ontology.load() @@ -16,11 +17,11 @@ # This model uses a fixed prompt and string normalization (Levenshtein) to classify terms. 
llm_learner = SKHNLPZSLearner( model_name="Qwen/Qwen2.5-0.5B-Instruct", - device="cpu", # use "cuda" if you have a GPU + device="cpu", # use "cuda" if you have a GPU max_new_tokens=16, - save_path="./outputs/", # directory or full file path for CSV + save_path="./outputs/", # directory or full file path for CSV verbose=True, - normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" + normalize_mode="levenshtein", # "none" | "substring" | "levenshtein" | "auto" ) # Build pipeline and run @@ -33,7 +34,7 @@ # Run the full learning pipeline on the taxonomy-discovery task outputs = pipe( - train_data=train_data, # zero-shot; ignored by the LLM learner + train_data=train_data, # zero-shot; ignored by the LLM learner test_data=test_data, task="taxonomy-discovery", evaluate=True, diff --git a/ontolearner/learner/taxonomy_discovery/__init__.py b/ontolearner/learner/taxonomy_discovery/__init__.py deleted file mode 100644 index 57a845b..0000000 --- a/ontolearner/learner/taxonomy_discovery/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .rwthdbis import RWTHDBISSFTLearner -from .skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner -from .sbunlp import SBUNLPFewShotLearner -from .alexbek import AlexbekCrossAttnLearner diff --git a/ontolearner/learner/taxonomy_discovery/alexbek.py b/ontolearner/learner/taxonomy_discovery/alexbek.py index 616d50f..3623f16 100644 --- a/ontolearner/learner/taxonomy_discovery/alexbek.py +++ b/ontolearner/learner/taxonomy_discovery/alexbek.py @@ -24,33 +24,70 @@ from ...base import AutoLearner + class RMSNorm(nn.Module): """Root Mean Square normalization with learnable scale. - Computes: y = weight * x / sqrt(mean(x^2) + eps) + Computes per-position normalization: + y = weight * x / sqrt(mean(x^2) + eps) + + This variant normalizes over the last dimension and keeps scale as a + learnable parameter, similar to RMSNorm used in modern transformer stacks. """ def __init__(self, dim: int, eps: float = 1e-6): + """Initialize the RMSNorm layer. + + Args: + dim: Size of the last (feature) dimension to normalize over. + eps: Small constant added inside the square root for numerical + stability. + """ super().__init__() self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply RMS normalization. + + Args: + x: Input tensor of shape (..., dim). + + Returns: + Tensor of the same shape as `x`, RMS-normalized over the last axis. + """ rms_inv = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) return self.weight * (x * rms_inv) + class CrossAttentionHead(nn.Module): """Minimal multi-head *pair* scorer using cross-attention-style projections. - Given child vector c and parent vector p: - q = Wq * c, k = Wk * p - per-head score = (q_h · k_h) / sqrt(d_head) - aggregate by mean across heads, then sigmoid to get probability. 
+ Given child vector `c` and parent vector `p`: + q = W_q * c, k = W_k * p + score_head = (q_h · k_h) / sqrt(d_head) + + We average the per-head scores and apply a sigmoid to produce a probability. + This is not a full attention block—just a learnable similarity function. """ - def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6): + def __init__( + self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1e-6 + ): + """Initialize projections and per-stream normalizers. + + Args: + hidden_size: Dimensionality of input embeddings (child/parent). + num_heads: Number of subspaces to split the projection into. + rms_norm_eps: Epsilon for RMSNorm stability. + + Raises: + AssertionError: If `hidden_size` is not divisible by `num_heads`. + """ super().__init__() - assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads" + assert hidden_size % num_heads == 0, ( + "hidden_size must be divisible by num_heads" + ) self.hidden_size = hidden_size self.num_heads = num_heads self.dim_per_head = hidden_size // num_heads @@ -67,14 +104,17 @@ def __init__(self, hidden_size: int, num_heads: int = 8, rms_norm_eps: float = 1 nn.init.xavier_uniform_(self.query_projection.weight) nn.init.xavier_uniform_(self.key_projection.weight) - def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor) -> torch.Tensor: + def forward( + self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tensor + ) -> torch.Tensor: """Score (child, parent) pairs. Args: - child_embeddings: Tensor of shape (batch, hidden_size) - parent_embeddings: Tensor of shape (batch, hidden_size) + child_embeddings: Tensor of shape (batch, hidden_size). + parent_embeddings: Tensor of shape (batch, hidden_size). + Returns: - Tensor of probabilities with shape (batch,) + Tensor of probabilities with shape (batch,), each in [0, 1]. """ batch_size, _ = child_embeddings.shape @@ -95,14 +135,17 @@ def forward(self, child_embeddings: torch.Tensor, parent_embeddings: torch.Tenso # Map to probability return torch.sigmoid(mean_score) + class AlexbekCrossAttnLearner(AutoLearner): """Cross-Attention Taxonomy Learner (inherits AutoLearner). - - Encodes type strings with a SentenceTransformer. - - Trains a small cross-attention head to score (parent, child) edges. - - Predicts probabilities for provided pairs. + Workflow + - Encode terms with a SentenceTransformer. + - Train a compact cross-attention head on (parent, child) pairs + (positives + sampled negatives) using BCE loss. + - Inference returns probabilities per pair; edges with prob >= 0.5 are + labeled as positive. - Helper functions live in this same module (below), *not* as class methods. """ def __init__( @@ -122,11 +165,26 @@ def __init__( ): """Configure the learner. - All configuration is kept directly on the learner (no separate Config class). + Args: + embedding_model: SentenceTransformer model id/path for term encoding. + device: 'cuda' or 'cpu'. If 'cuda' is requested but unavailable, CPU + is used. + num_heads: Number of heads in the cross-attention scorer. + lr: Learning rate for AdamW. + weight_decay: Weight decay for AdamW. + num_epochs: Number of epochs to train the head. + batch_size: Minibatch size for training and scoring loops. + neg_ratio: Number of sampled negatives per positive during training. + output_dir: Directory to store artifacts (reserved for future use). + seed: Random seed for reproducibility. + **kwargs: Passed through to `AutoLearner` base init. 
+ + Side Effects: + Creates `output_dir` if missing and seeds Python/Torch RNGs. """ super().__init__(**kwargs) - # ----- hyperparameters / settings ----- + # hyperparameters / settings self.embedding_model_id = embedding_model self.requested_device = device self.num_heads = num_heads @@ -157,25 +215,62 @@ def __init__( torch.manual_seed(self.seed) def load(self, **kwargs: Any): - """Load the sentence embedding model and initialize the cross-attention head.""" + """Load the sentence embedding model and initialize the cross-attention head. + + Args: + **kwargs: Optional override, supports `embedding_model`. + + Side Effects: + - Initializes `self.embedder` on the configured device. + - Probes and stores `self.embedding_dim`. + - Constructs `self.cross_attn_head` with the probed dimensionality. + """ model_id = kwargs.get("embedding_model", self.embedding_model_id) - self.embedder = SentenceTransformer(model_id, trust_remote_code=True, device=str(self.device)) + self.embedder = SentenceTransformer( + model_id, trust_remote_code=True, device=str(self.device) + ) # Probe output dimensionality using a dummy encode - probe_embedding = self.embedder.encode(["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False) + probe_embedding = self.embedder.encode( + ["_dim_probe_"], convert_to_tensor=True, normalize_embeddings=False + ) self.embedding_dim = int(probe_embedding.shape[-1]) # Initialize the cross-attention head - self.cross_attn_head = CrossAttentionHead(hidden_size=self.embedding_dim, num_heads=self.num_heads).to( - self.device - ) + self.cross_attn_head = CrossAttentionHead( + hidden_size=self.embedding_dim, num_heads=self.num_heads + ).to(self.device) def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """Train or infer taxonomy edges according to the AutoLearner contract. + + Training (`test=False`) + - Extract positives (parent, child) and the unique term set from `data`. + - Build/extend the term embedding cache. + - Sample negatives at ratio `self.negative_ratio`. + - Train the cross-attention head with BCE loss. + + Inference (`test=True`) + - Ensure embeddings exist for all terms. + - Score candidate pairs and return per-pair probabilities and labels. + + Args: + data: Ontology-like object exposing `type_taxonomies.taxonomies`, + where each item has `.parent` and `.child` string-like fields. + test: If True, perform inference instead of training. + + Returns: + - `None` on training. + - On inference: List of dicts + `{"parent": str, "child": str, "score": float, "label": int}`. 
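+
+        Example (illustrative only; the terms and score are hypothetical and the
+        method is normally invoked through the AutoLearner pipeline):
+            >>> predictions = learner._taxonomy_discovery(test_ontology, test=True)
+            >>> predictions[0]
+            {'parent': 'waterbody', 'child': 'lake', 'score': 0.87, 'label': 1}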
+ """ if self.embedder is None or self.cross_attn_head is None: self.load() if not test: - positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + positive_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) self._ensure_term_embeddings(unique_terms) negative_pairs = self._sample_negative_pairs( positive_pairs, unique_terms, ratio=self.negative_ratio, seed=self.seed @@ -183,27 +278,42 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: self._train_cross_attn_head(positive_pairs, negative_pairs) return None else: - candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms(data) + candidate_pairs, unique_terms = self._extract_parent_child_pairs_and_terms( + data + ) self._ensure_term_embeddings(unique_terms, append_only=True) probabilities = self._score_parent_child_pairs(candidate_pairs) predictions = [ - {"parent": parent, "child": child, "score": float(prob), "label": int(prob >= 0.5)} + { + "parent": parent, + "child": child, + "score": float(prob), + "label": int(prob >= 0.5), + } for (parent, child), prob in zip(candidate_pairs, probabilities) ] return predictions - def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) -> None: + def _ensure_term_embeddings( + self, terms: List[str], append_only: bool = False + ) -> None: """Encode terms with the sentence embedder and store in cache. Args: - terms: list of unique strings to embed - append_only: if True, only embed terms missing from cache + terms: List of unique term strings to embed. + append_only: If True, only embed terms missing from the cache; + otherwise (re)encode all provided terms. + + Raises: + RuntimeError: If called before `load()`. """ if self.embedder is None: raise RuntimeError("Call load() before building term embeddings") - terms_to_encode = [t for t in terms if t not in self.term_to_vector] if append_only else terms + terms_to_encode = ( + [t for t in terms if t not in self.term_to_vector] if append_only else terms + ) if not terms_to_encode: return @@ -217,38 +327,78 @@ def _ensure_term_embeddings(self, terms: List[str], append_only: bool = False) - for term, embedding in zip(terms_to_encode, embeddings): self.term_to_vector[term] = embedding.detach().to(self.device) - def _pairs_as_tensors(self, pairs: List[Tuple[str, str]]) -> Tuple[torch.Tensor, torch.Tensor]: - """Turn list of (parent, child) strings into two aligned tensors on device.""" + def _pairs_as_tensors( + self, pairs: List[Tuple[str, str]] + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Convert string pairs into aligned embedding tensors on the correct device. + + Args: + pairs: List of (parent, child) term strings. + + Returns: + Tuple `(child_tensor, parent_tensor)` where each tensor has shape + `(batch, embedding_dim)` and is located on `self.device`. + + Notes: + This function assumes that all terms in `pairs` are present in + `self.term_to_vector`. Use `_ensure_term_embeddings` beforehand. 
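+
+        Example (shapes only; the terms are hypothetical and assumed to already
+        be present in the embedding cache):
+            >>> child_t, parent_t = self._pairs_as_tensors([("waterbody", "lake")])
+            >>> child_t.shape == parent_t.shape == (1, self.embedding_dim)
+            True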
+ """ # child embeddings tensor of shape (batch, dim) - child_tensor = torch.stack([self.term_to_vector[child] for (_, child) in pairs], dim=0).to(self.device) + child_tensor = torch.stack( + [self.term_to_vector[child] for (_, child) in pairs], dim=0 + ).to(self.device) # parent embeddings tensor of shape (batch, dim) - parent_tensor = torch.stack([self.term_to_vector[parent] for (parent, _) in pairs], dim=0).to(self.device) + parent_tensor = torch.stack( + [self.term_to_vector[parent] for (parent, _) in pairs], dim=0 + ).to(self.device) return child_tensor, parent_tensor - def _train_cross_attn_head(self, positive_pairs: List[Tuple[str, str]], negative_pairs: List[Tuple[str, str]]) -> None: - """Train the cross-attention head with BCE loss on labeled pairs.""" + def _train_cross_attn_head( + self, + positive_pairs: List[Tuple[str, str]], + negative_pairs: List[Tuple[str, str]], + ) -> None: + """Train the cross-attention head with BCE loss on labeled pairs. + + The dataset is a concatenation of positives (label 1) and sampled + negatives (label 0). The head is optimized with AdamW. + + Args: + positive_pairs: List of ground-truth (parent, child) edges. + negative_pairs: List of sampled non-edges. + + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ if self.cross_attn_head is None: raise RuntimeError("Head not initialized. Call load().") self.cross_attn_head.train() optimizer = torch.optim.AdamW( - self.cross_attn_head.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + self.cross_attn_head.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, ) # Build a simple supervised dataset: 1 for positive, 0 for negative - labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [(1, pc) for pc in positive_pairs] + [ - (0, nc) for nc in negative_pairs - ] + labeled_pairs: List[Tuple[int, Tuple[str, str]]] = [ + (1, pc) for pc in positive_pairs + ] + [(0, nc) for nc in negative_pairs] random.shuffle(labeled_pairs) - def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: int): + def iterate_minibatches( + items: List[Tuple[int, Tuple[str, str]]], batch_size: int + ): + """Yield contiguous minibatches of size `batch_size` from `items`.""" for start in range(0, len(items), batch_size): yield items[start : start + batch_size] for epoch in range(self.num_epochs): epoch_loss_sum = 0.0 for minibatch in iterate_minibatches(labeled_pairs, self.batch_size): - labels = torch.tensor([y for y, _ in minibatch], dtype=torch.float32, device=self.device) + labels = torch.tensor( + [y for y, _ in minibatch], dtype=torch.float32, device=self.device + ) string_pairs = [pc for _, pc in minibatch] child_tensor, parent_tensor = self._pairs_as_tensors(string_pairs) @@ -261,9 +411,18 @@ def iterate_minibatches(items: List[Tuple[int, Tuple[str, str]]], batch_size: in epoch_loss_sum += float(loss.item()) * len(minibatch) - def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float]: - """Compute probability scores for (parent, child) pairs.""" + """Compute probability scores for (parent, child) pairs. + + Args: + pairs: List of candidate (parent, child) edges to score. + + Returns: + List of floats in [0, 1] corresponding to the input order. + + Raises: + RuntimeError: If the head has not been initialized (call `load()`). + """ if self.cross_attn_head is None: raise RuntimeError("Head not initialized. 
Call load().") @@ -277,8 +436,23 @@ def _score_parent_child_pairs(self, pairs: List[Tuple[str, str]]) -> List[float] scores.extend(prob.detach().cpu().tolist()) return scores - def _extract_parent_child_pairs_and_terms(self, data): - parent_child_pairs = [] + def _extract_parent_child_pairs_and_terms( + self, data: Any + ) -> Tuple[List[Tuple[str, str]], List[str]]: + """Extract (parent, child) edges and the set of unique terms from an ontology-like object. + + The function expects `data.type_taxonomies.taxonomies` to be an iterable + of objects with `.parent` and `.child` string-like attributes. + + Args: + data: Ontology-like container. + + Returns: + A tuple `(pairs, terms)` where: + - `pairs` is a list of (parent, child) strings, + - `terms` is a sorted list of unique term strings (parents ∪ children). + """ + parent_child_pairs: List[Tuple[str, str]] = [] unique_terms = set() for edge in getattr(data, "type_taxonomies").taxonomies: parent, child = str(edge.parent), str(edge.child) @@ -287,11 +461,32 @@ def _extract_parent_child_pairs_and_terms(self, data): unique_terms.add(child) return parent_child_pairs, sorted(unique_terms) - def _sample_negative_pairs(self, positive_pairs, terms, ratio: float = 1.0, seed: int = 42): + def _sample_negative_pairs( + self, + positive_pairs: List[Tuple[str, str]], + terms: List[str], + ratio: float = 1.0, + seed: int = 42, + ) -> List[Tuple[str, str]]: + """Sample random negative (parent, child) pairs not present in positives. + + Sampling is uniform over the Cartesian product of `terms` excluding + (x, x) self-pairs and any pair found in `positive_pairs`. + + Args: + positive_pairs: Known positive edges to exclude. + terms: Candidate vocabulary (parents ∪ children). + ratio: Number of negatives per positive to draw. + seed: RNG seed used for reproducible sampling. + + Returns: + A list of sampled negative pairs of approximate length + `int(len(positive_pairs) * ratio)`. + """ random.seed(seed) term_list = list(terms) positive_set = set(positive_pairs) - negatives = [] + negatives: List[Tuple[str, str]] = [] target_negative_count = int(len(positive_pairs) * ratio) while len(negatives) < target_negative_count: parent = random.choice(term_list) diff --git a/ontolearner/learner/taxonomy_discovery/rwthdbis.py b/ontolearner/learner/taxonomy_discovery/rwthdbis.py index 47989c5..c535016 100644 --- a/ontolearner/learner/taxonomy_discovery/rwthdbis.py +++ b/ontolearner/learner/taxonomy_discovery/rwthdbis.py @@ -16,9 +16,7 @@ import os import random import re -import time import platform -import multiprocessing from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Callable @@ -39,16 +37,45 @@ from ...base import AutoLearner + class RWTHDBISSFTLearner(AutoLearner): """ Supervised classifier for (parent, child) taxonomy edges. Model input format: - " ## " - - If no `context_json_path` is provided, the class precomputes a - context file ({ontology_name}_processed.json) directly from the ontology - object. + " ## " + + Context building: + If no `context_json_path` is provided, the learner precomputes a fixed-name + context file `rwthdbis_onto_processed.json` under `output_dir/context/` + from the ontology terms and stores the path in `self.context_json_path`. + + Attributes: + model_name: Hugging Face model identifier. + output_dir: Directory where checkpoints and tokenizer are saved/loaded. 
+ min_predictions: If no candidate is predicted positive, return the top-k + by positive probability (k = min_predictions). + max_length: Maximum tokenized length for inputs. + per_device_train_batch_size: Micro-batch size per device. + gradient_accumulation_steps: Gradient accumulation steps. + num_train_epochs: Number of training epochs. + learning_rate: Optimizer LR. + weight_decay: Weight decay for AdamW. + logging_steps: Logging interval for Trainer. + save_strategy: HF saving strategy (e.g., 'epoch'). + save_total_limit: Max checkpoints to keep. + fp16: Enable FP16 mixed precision. + bf16: Enable BF16 mixed precision (on supported hardware). + seed: Random seed for reproducibility. + negative_ratio: Number of negatives per positive during training. + bidirectional_templates: If True, also add reversed template examples. + context_json_path: Path to the preprocessed term-context JSON. If None, + the file is generated with the fixed prefix `rwthdbis_onto_*`. + ontology_name: Logical dataset/domain label used in prompts and filtering + (filenames still use the fixed `rwthdbis_onto_*` prefix). + device: user-defined argument as 'cuda' or 'cpu'. + model: Loaded/initialized `AutoModelForSequenceClassification`. + tokenizer: Loaded/initialized `AutoTokenizer`. """ # Sentences containing any of these phrases are pruned from term_info. @@ -78,7 +105,8 @@ def __init__( self, min_predictions: int = 1, model_name: str = "distilroberta-base", - output_dir: str = "./results/{model_name}", + output_dir: str = "./results/taxonomy-discovery", + device: str = "cpu", max_length: int = 256, per_device_train_batch_size: int = 8, gradient_accumulation_steps: int = 4, @@ -94,56 +122,176 @@ def __init__( negative_ratio: int = 5, bidirectional_templates: bool = True, context_json_path: Optional[str] = None, - ontology_name: str = "Geonames" + ontology_name: str = "Geonames", ) -> None: + """ + Initialize the taxonomy-edge learner and set training/inference knobs. + + Notes: + - Output artifacts are written under `output_dir`, including + the model weights and tokenizer (for later `from_pretrained` loads). + - If `context_json_path` is not provided, a new context file named + `rwthdbis_onto_processed.json` is generated under `output_dir/context/`. + """ super().__init__() self.model_name = model_name - self.safe_model_name = model_name.replace("/", "__") + safe_model_name = model_name.replace("/", "__") - resolved_output = output_dir.format(model_name=self.safe_model_name) + resolved_output = output_dir.format(model_name=safe_model_name) self.output_dir = str(Path(resolved_output)) Path(self.output_dir).mkdir(parents=True, exist_ok=True) - self.min_predictions = int(min_predictions) - self.max_length = int(max_length) - self.per_device_train_batch_size = int(per_device_train_batch_size) - self.gradient_accumulation_steps = int(gradient_accumulation_steps) - self.num_train_epochs = float(num_train_epochs) - self.learning_rate = float(learning_rate) - self.weight_decay = float(weight_decay) - self.logging_steps = int(logging_steps) - self.save_strategy = str(save_strategy) - self.save_total_limit = int(save_total_limit) - self.fp16 = bool(fp16) - self.bf16 = bool(bf16) - self.seed = int(seed) - - self.negative_ratio = int(negative_ratio) - self.bidirectional_templates = bool(bidirectional_templates) + # Store provided argument values as-is (types are enforced by callers). 
+ self.min_predictions = min_predictions + self.max_length = max_length + self.per_device_train_batch_size = per_device_train_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.num_train_epochs = num_train_epochs + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.logging_steps = logging_steps + self.save_strategy = save_strategy + self.save_total_limit = save_total_limit + self.fp16 = fp16 + self.bf16 = bf16 + self.seed = seed + + self.negative_ratio = negative_ratio + self.bidirectional_templates = bidirectional_templates self.context_json_path = context_json_path self.ontology_name = ontology_name - self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device self.model: Optional[AutoModelForSequenceClassification] = None self.tokenizer: Optional[AutoTokenizer] = None - os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") - os.environ.setdefault("WANDB_DISABLED", "true") - os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") + # Context caches built from the context JSON. + self._context_exact: Dict[str, str] = {} # lower(term) -> info + self._context_rows: List[ + Dict[str, str] + ] = [] # [{'term': str, 'term_info': str}, ...] + + def _is_windows(self) -> bool: + """Return True if the current OS is Windows (NT).""" + return (os.name == "nt") or (platform.system().lower() == "windows") + + def _normalize_text(self, raw_text: str, *, drop_questions: bool = False) -> str: + """ + Normalize plain text consistently across the pipeline. + + Operations: + - Remove markdown-like link patterns (e.g., '[[1]](http://...)'). + - Replace newlines with spaces; collapse repeated spaces. + - Optionally drop sentences containing '?' (useful for model generations). + + Args: + raw_text: Input text to normalize. + drop_questions: If True, filter out sentences with '?'. + + Returns: + str: Cleaned single-line string. + """ + if raw_text is None: + return "" + text = str(raw_text) + + # Remove simple markdown link artifacts like [[1]](http://...) + text = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", text) + + # Replace newlines with spaces and collapse multiple spaces + text = text.replace("\n", " ") + text = re.sub(r"\s{2,}", " ", text) + + if drop_questions: + sentences = [s.strip() for s in text.split(".")] + sentences = [s for s in sentences if s and "?" not in s] + text = ". ".join(sentences) + + return text.strip() + + def _default_gpt_inference_with_dataset(self, term: str, dataset_name: str) -> str: + """ + Generate a plain-text description for `term`, conditioned on `dataset_name`, + via g4f (best-effort). Falls back to an empty string on failure. + + The raw output is then normalized with `_normalize_text(drop_questions=True)`. + + Args: + term: Term to describe. + dataset_name: Ontology/domain name used in the prompt. + + Returns: + str: Cleaned paragraph describing the term, or "" on failure. + """ + prompt = ( + f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, " + "Provide as detailed a definition of this term as possible in plain text.without any markdown format." + "No reference link in result. " + "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n" + "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n" + "Output: Plain text paragraphs only, neutral and factual." 
+ f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result." + ) + + try: + client = _G4FClient() + response = client.chat.completions.create( + model=g4f.models.default, + messages=[{"role": "user", "content": prompt}], + ) + raw_text = ( + response.choices[0].message.content + if response and response.choices + else "" + ) + except Exception: + raw_text = "" # best-effort fallback - self._context_exact: Dict[str, str] = {} # lower(term) -> info - self._context_rows: List[Dict[str, str]] = [] # [{'term': str, 'term_info': str}, ...] + return self._normalize_text(raw_text, drop_questions=True) def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + """ + AutoLearner hook: route to training or prediction. + + Args: + data: Ontology-like object (has `.taxonomies` or `.type_taxonomies.taxonomies`). + test: If True, run inference; otherwise, train a model. + + Returns: + If test=True, a list of accepted edges as dicts with keys `parent` and `child`; + otherwise None. + """ return self._predict_pairs(data) if test else self._train_from_pairs(data) def _train_from_pairs(self, train_data: Any) -> None: + """ + Train a binary classifier from ontology pairs. + + Steps: + 1) (Re)build the term-context JSON unless `context_json_path` is set. + 2) Extract positive (parent, child) edges from `train_data`. + 3) Sample negatives at `negative_ratio`. + 4) Tokenize, instantiate HF Trainer, train, and save. + + Args: + train_data: Ontology-like object with `.type_taxonomies.taxonomies` + (preferred) or `.taxonomies`, each item providing `parent` and `child`. + + Raises: + ValueError: If no positive pairs are found. + + Side Effects: + - Writes a trained model to `self.output_dir` (via `trainer.save_model`). + - Writes the tokenizer to `self.output_dir` (via `save_pretrained`). + - Sets `self.context_json_path` if it was previously unset. + The generated context file is named `rwthdbis_onto_processed.json`. 
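+
+        Example (illustrative sketch; `train_ontology` stands in for any object
+        exposing `type_taxonomies.taxonomies`, and the output path is an assumption):
+            >>> learner = RWTHDBISSFTLearner(output_dir="./results/taxonomy-discovery")
+            >>> learner._train_from_pairs(train_ontology)
+            # Afterwards the trained model and tokenizer live in output_dir, and
+            # self.context_json_path points at .../context/rwthdbis_onto_processed.json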
+ """ # Always (re)build context from ontology unless an explicit file is provided if not self.context_json_path: context_dir = Path(self.output_dir) / "context" context_dir.mkdir(parents=True, exist_ok=True) - processed_context_file = context_dir / f"{self.ontology_name}_processed.json" + processed_context_file = context_dir / "rwthdbis_onto_processed.json" # Remove stale file then regenerate if processed_context_file.exists(): @@ -157,10 +305,12 @@ def _train_from_pairs(self, train_data: Any) -> None: processed_dir=context_dir, dataset_name=self.ontology_name, num_workers=max(1, min(os.cpu_count() or 2, 4)), - provider=partial(self._default_gpt_inference_with_dataset, dataset_name=self.ontology_name), + provider=partial( + self._default_gpt_inference_with_dataset, + dataset_name=self.ontology_name, + ), max_retries=5, ) - self.context_json_path = str(processed_context_file) # Reproducibility @@ -175,19 +325,23 @@ def _train_from_pairs(self, train_data: Any) -> None: if not positive_pairs: raise ValueError("No positive (parent, child) pairs found in train_data.") - entity_names = sorted({parent for parent, _ in positive_pairs} | {child for _, child in positive_pairs}) + entity_names = sorted( + {parent for parent, _ in positive_pairs} + | {child for _, child in positive_pairs} + ) negative_pairs = self._generate_negatives( positives=positive_pairs, entities=entity_names, ratio=self.negative_ratio, ) - labels, texts = self._build_text_dataset(positive_pairs, negative_pairs) - - - datasets = DatasetDict({"train": Dataset.from_dict({"label": labels, "text": texts})}) + labels, input_texts = self._build_text_dataset(positive_pairs, negative_pairs) + dataset_dict = DatasetDict( + {"train": Dataset.from_dict({"label": labels, "text": input_texts})} + ) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Ensure a pad token exists for robust padding across models. if self.tokenizer.pad_token is None: self.tokenizer.pad_token = ( getattr(self.tokenizer, "eos_token", None) @@ -196,10 +350,15 @@ def _train_from_pairs(self, train_data: Any) -> None: ) def tokenize_batch(batch: Dict[str, List[str]]): - return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + """Tokenize a batch of input texts for HF Datasets mapping.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) - tokenized = datasets.map(tokenize_batch, batched=True, remove_columns=["text"]) - collator = DataCollatorWithPadding(self.tokenizer) + tokenized_dataset = dataset_dict.map( + tokenize_batch, batched=True, remove_columns=["text"] + ) + data_collator = DataCollatorWithPadding(self.tokenizer) self.model = AutoModelForSequenceClassification.from_pretrained( self.model_name, @@ -207,10 +366,14 @@ def tokenize_batch(batch: Dict[str, List[str]]): id2label={0: "incorrect", 1: "correct"}, label2id={"incorrect": 0, "correct": 1}, ) - if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + # Ensure model has a pad_token_id if tokenizer provides one. 
+ if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): self.model.config.pad_token_id = self.tokenizer.pad_token_id - train_args = TrainingArguments( + training_args = TrainingArguments( output_dir=self.output_dir, learning_rate=self.learning_rate, per_device_train_batch_size=self.per_device_train_batch_size, @@ -220,7 +383,7 @@ def tokenize_batch(batch: Dict[str, List[str]]): save_strategy=self.save_strategy, save_total_limit=self.save_total_limit, logging_steps=self.logging_steps, - dataloader_pin_memory = bool(torch.cuda.is_available()), + dataloader_pin_memory=bool(torch.cuda.is_available()), fp16=self.fp16, bf16=self.bf16, report_to="none", @@ -229,16 +392,30 @@ def tokenize_batch(batch: Dict[str, List[str]]): trainer = Trainer( model=self.model, - args=train_args, - train_dataset=tokenized["train"], + args=training_args, + train_dataset=tokenized_dataset["train"], tokenizer=self.tokenizer, - data_collator=collator, + data_collator=data_collator, ) trainer.train() - trainer.save_model(self.output_dir) + trainer.save_model() + # Persist tokenizer alongside the model for from_pretrained() loads. self.tokenizer.save_pretrained(self.output_dir) def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: + """ + Score candidate pairs and return those predicted as positive. + + If no pair is predicted positive but `min_predictions` > 0, the top-k + pairs by positive probability are returned. + + Args: + eval_data: Ontology-like object with either `.pairs` (preferred) or + `.type_taxonomies.taxonomies` / `.taxonomies`. + + Returns: + list[dict]: Each dict has keys `parent` and `child`. + """ import torch.nn.functional as F self._ensure_loaded_for_inference() @@ -247,55 +424,90 @@ def _predict_pairs(self, eval_data: Any) -> List[Dict[str, str]]: if not candidate_pairs: return [] - accepted: List[Dict[str, str]] = [] + accepted_pairs: List[Dict[str, str]] = [] scored_candidates: List[Tuple[float, str, str, int]] = [] self.model.eval() with torch.no_grad(): for parent_term, child_term in candidate_pairs: input_text = self._format_input(parent_term, child_term) - inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=self.max_length) - inputs = {k: v.to(self.device) for k, v in inputs.items()} + inputs = self.tokenizer( + input_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) + inputs = {key: tensor.to(self.device) for key, tensor in inputs.items()} logits = self.model(**inputs).logits - probs = F.softmax(logits, dim=-1).squeeze(0) - p_positive = float(probs[1].item()) + probabilities = F.softmax(logits, dim=-1).squeeze(0) + p_positive = float(probabilities[1].item()) predicted_label = int(torch.argmax(logits, dim=-1).item()) - scored_candidates.append((p_positive, parent_term, child_term, predicted_label)) + scored_candidates.append( + (p_positive, parent_term, child_term, predicted_label) + ) if predicted_label == 1: - accepted.append({"parent": parent_term, "child": child_term}) + accepted_pairs.append({"parent": parent_term, "child": child_term}) - if accepted: - return accepted + if accepted_pairs: + return accepted_pairs top_k = max(0, int(self.min_predictions)) if top_k == 0: return [] scored_candidates.sort(key=lambda item: item[0], reverse=True) - return [{"parent": parent_term, "child": child_term} - for (_prob, parent_term, child_term, _pred) in scored_candidates[:top_k]] + return [ + {"parent": parent_term, "child": child_term} + for (_prob, parent_term, 
child_term, _pred) in scored_candidates[:top_k] + ] def _ensure_loaded_for_inference(self) -> None: + """ + Load model and tokenizer from `self.output_dir` if not already loaded. + + Side Effects: + - Sets `self.model` and `self.tokenizer`. + - Moves the model to `self.device`. + - Ensures `tokenizer.pad_token_id` is set if model config provides one. + """ if self.model is not None and self.tokenizer is not None: return - self.model = AutoModelForSequenceClassification.from_pretrained(self.output_dir).to(self.device) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.output_dir + ).to(self.device) self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir) - if self.tokenizer.pad_token_id is None and getattr(self.model.config, "pad_token_id", None) is not None: + if ( + self.tokenizer.pad_token_id is None + and getattr(self.model.config, "pad_token_id", None) is not None + ): self.tokenizer.pad_token_id = self.model.config.pad_token_id def _load_context_map(self) -> None: - """Build exact and fuzzy maps from {ontology_name}_processed.json.""" - if not (self.context_json_path): + """ + Populate in-memory maps from the context JSON (`self.context_json_path`). + + Builds: + - `_context_exact`: dict mapping lowercased term → term_info. + - `_context_rows`: list of dict rows with 'term' and 'term_info'. + + If `context_json_path` is falsy or loading fails, both structures become empty. + """ + if not self.context_json_path: self._context_exact = {} self._context_rows = [] return try: rows = json.load(open(self.context_json_path, "r", encoding="utf-8")) self._context_exact = { - str(row.get("term", "")).strip().lower(): str(row.get("term_info", "")).strip() + str(row.get("term", "")).strip().lower(): str( + row.get("term_info", "") + ).strip() for row in rows } self._context_rows = [ - {"term": str(row.get("term", "")), "term_info": str(row.get("term_info", ""))} + { + "term": str(row.get("term", "")), + "term_info": str(row.get("term_info", "")), + } for row in rows ] except Exception: @@ -304,8 +516,17 @@ def _load_context_map(self) -> None: def _lookup_context_info(self, raw_term: str) -> str: """ - Loose context lookup: split by commas, strip whitespace, case-insensitive - substring match against any row['term']. Join hits with '.'. + Retrieve textual context for a term using exact and simple fuzzy matching. + + - Exact: lowercased term lookup in `_context_exact`. + - Fuzzy: split `raw_term` by commas, strip whitespace; treat each piece + as a case-insensitive substring against row['term']. + + Args: + raw_term: Original term string (possibly comma-separated). + + Returns: + str: Concatenated matches' term_info ('.' joined). Empty string if none. """ if not raw_term: return "" @@ -329,27 +550,62 @@ def _lookup_context_info(self, raw_term: str) -> str: def _extract_positive_pairs(self, ontology_obj: Any) -> List[Tuple[str, str]]: """ - Read pairs from ontology_obj.type_taxonomies.taxonomies (or fallback to .taxonomies). - Each item must provide 'parent' and 'child' attributes/keys. + Extract positive (parent, child) edges from an ontology-like object. + + Reads from `ontology_obj.type_taxonomies.taxonomies` (preferred) or + falls back to `ontology_obj.taxonomies`. Each item must expose `parent` + and `child` as attributes or dict keys. + + Returns: + list[tuple[str, str]]: (parent, child) pairs (may be empty). 
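+
+        Example (illustrative; the terms are hypothetical):
+            An item carrying `parent="facility"` and `child="airport"` contributes
+            the tuple `("facility", "airport")`, whether it is an object with those
+            attributes or a dict with those keys.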
""" type_taxonomies = getattr(ontology_obj, "type_taxonomies", None) - items = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology_obj, "taxonomies", None) + items = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology_obj, "taxonomies", None) + ) pairs: List[Tuple[str, str]] = [] if items: for item in items: - parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") - child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) if parent_term and child_term: pairs.append((str(parent_term), str(child_term))) return pairs def _extract_pairs_for_eval(self, ontology_obj: Any) -> List[Tuple[str, str]]: + """ + Extract candidate pairs for evaluation. + + Prefers `ontology_obj.pairs` if present; otherwise falls back to the + positive pairs from the ontology (see `_extract_positive_pairs`). + + Returns: + list[tuple[str, str]]: Candidate (parent, child) pairs. + """ candidate_pairs = getattr(ontology_obj, "pairs", None) if candidate_pairs: pairs: List[Tuple[str, str]] = [] for item in candidate_pairs: - parent_term = getattr(item, "parent", None) if not isinstance(item, dict) else item.get("parent") - child_term = getattr(item, "child", None) if not isinstance(item, dict) else item.get("child") + parent_term = ( + getattr(item, "parent", None) + if not isinstance(item, dict) + else item.get("parent") + ) + child_term = ( + getattr(item, "child", None) + if not isinstance(item, dict) + else item.get("child") + ) if parent_term and child_term: pairs.append((str(parent_term), str(child_term))) return pairs @@ -361,29 +617,66 @@ def _generate_negatives( entities: List[str], ratio: int, ) -> List[Tuple[str, str]]: + """ + Sample negative edges by excluding known positives and self-pairs. + + Constructs the cartesian product of entities (excluding (x, x)), + removes all known positives, and samples up to `ratio * len(positives)` + negatives uniformly at random. + + Args: + positives: Known positive edges. + entities: Unique set/list of entity terms. + ratio: Target negatives per positive (lower-bounded by 1×). + + Returns: + list[tuple[str, str]]: Sampled negative pairs (may be smaller). + """ positive_set = set(positives) - all_possible = {(parent_term, child_term) for parent_term in entities for child_term in entities if parent_term != child_term} + all_possible = { + (parent, child) + for parent in entities + for child in entities + if parent != child + } negative_candidates = list(all_possible - positive_set) target_count = max(len(positive_set) * max(1, ratio), len(positive_set)) sample_count = min(target_count, len(negative_candidates)) - return random.sample(negative_candidates, k=sample_count) if sample_count > 0 else [] + return ( + random.sample(negative_candidates, k=sample_count) + if sample_count > 0 + else [] + ) def _build_text_dataset( self, positives: List[Tuple[str, str]], negatives: List[Tuple[str, str]], ) -> Tuple[List[int], List[str]]: + """ + Create parallel lists of labels and input texts for HF Datasets. + + Builds formatted inputs using `_format_input`, and duplicates examples in + the reverse direction if `bidirectional_templates` is True. 
+
+        Returns:
+            tuple[list[int], list[str]]: (labels, input_texts) where labels are
+                1 for positive and 0 for negative.
+        """
         self._load_context_map()
         labels: List[int] = []
         input_texts: List[str] = []
 
         def add_example(parent_term: str, child_term: str, label_value: int) -> None:
+            """Append one (and optionally reversed) example to the dataset."""
             input_texts.append(self._format_input(parent_term, child_term))
             labels.append(label_value)
             if self.bidirectional_templates:
-                input_texts.append(self._format_input(child_term, parent_term, reverse=True))
+                input_texts.append(
+                    self._format_input(child_term, parent_term, reverse=True)
+                )
                 labels.append(label_value)
 
         for parent_term, child_term in positives:
@@ -393,7 +686,15 @@ def add_example(parent_term: str, child_term: str, label_value: int) -> None:
 
         return labels, input_texts
 
-    def _format_input(self, parent_term: str, child_term: str, reverse: bool = False) -> str:
+    def _format_input(
+        self, parent_term: str, child_term: str, reverse: bool = False
+    ) -> str:
+        """
+        Format a (parent, child) pair into relation text + optional context.
+
+        Returns:
+            str: The relation sentence, optionally followed by "## Context. 'parent': ... 'child': ...".
+        """
         relation_text = (
             f"{child_term} is a subclass / child / subtype / descendant class of {parent_term}"
             if reverse
@@ -405,63 +706,70 @@ def _format_input(self, parent_term: str, child_term: str, reverse: bool = False
         if not parent_info and not child_info:
             return relation_text
 
-        context_text = f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}"
+        context_text = (
+            f"## Context. '{parent_term}': {parent_info} '{child_term}': {child_info}"
+        )
         return f"{relation_text} {context_text}"
 
-    @staticmethod
-    def _is_windows() -> bool:
-        return (os.name == "nt") or (platform.system().lower() == "windows")
-
-    @staticmethod
-    def _default_gpt_inference_with_dataset(term: str, dataset_name: str) -> str:
-        """
-        Generate a plain-text description for `term`, tailored by `dataset_name`.
-        Uses g4f if available; otherwise returns an empty string.
+    def _fill_bucket_threaded(
+        self, bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str]
+    ) -> None:
         """
-        prompt = (
-            f"Here is a: {term}, which is of domain name :{dataset_name}, translate it into english, "
-            "Provide as detailed a definition of this term as possible in plain text.without any markdown format."
-            "No reference link in result. "
-            "- Focus on intrinsic properties; do not name other entities or explicit relationships.\n"
-            "- Include classification/type, defining features, scope/scale, roles/functions, and measurable attributes when applicable.\n"
-            "Output: Plain text paragraphs only, neutral and factual."
-            f"Make sure all provided information can be used for discovering implicit relation of other {dataset_name} term, but don't mention the relation in result."
-        )
+        Populate a shard with provider-generated `term_info` using threads.
+        Resumes from `output_path` if it already exists, periodically writes
+        progress (every ~10 items), and finally dumps the full bucket to disk.
+ """ + start_index = 0 try: - client = _G4FClient() - response = client.chat.completions.create( - model=g4f.models.default, - messages=[{"role": "user", "content": prompt}], - ) - raw_text = response.choices[0].message.content if response and response.choices else "" + if output_path.is_file(): + existing_rows = json.load(open(output_path, "r", encoding="utf-8")) + if isinstance(existing_rows, list) and existing_rows: + bucket_rows[: len(existing_rows)] = existing_rows + start_index = len(existing_rows) except Exception: - raw_text = "" # or some deterministic fallback - - # Clean up - cleaned = re.sub(r"[\*\-\#]", "", raw_text) - cleaned = re.sub(r"\n\s*\n", " ", cleaned) - cleaned = cleaned.replace("\n", " ") - cleaned = re.sub(r"\s{2,}", " ", cleaned) - cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", cleaned) - sentences = [sentence for sentence in cleaned.split(".") if "?" not in sentence] - return ".".join(sentences).strip() - - @staticmethod - def _clean_term_info(raw_text: str) -> str: - """Normalize whitespace and remove link artifacts.""" - cleaned = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", str(raw_text)) - cleaned = re.sub(r"\s+", " ", cleaned).strip() - return cleaned - - @classmethod - def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: List[Path]) -> None: + pass + + for row_index in range(start_index, len(bucket_rows)): + try: + bucket_rows[row_index]["term_info"] = provider( + bucket_rows[row_index]["term"] + ) + except Exception: + bucket_rows[row_index]["term_info"] = "" + if row_index % 10 == 1: + json.dump( + bucket_rows[: row_index + 1], + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + json.dump( + bucket_rows, + open(output_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=2, + ) + + def _merge_part_files( + self, dataset_name: str, merged_path: Path, shard_paths: List[Path] + ) -> None: + """ + Merge shard files into one JSON and filter boilerplate sentences. + + - Reads shard lists/dicts from `shard_paths`. + - Drops sentences that contain markers in `_CONTEXT_REMOVALS` or the + `dataset_name` string. + - Normalizes the remaining text via `_normalize_text`. + - Writes merged JSON to `merged_path`, then best-effort deletes shards. 
+ """ merged_rows: List[dict] = [] - for part_path in part_paths: + for shard_path in shard_paths: try: - if not part_path.is_file(): + if not shard_path.is_file(): continue - part_content = json.load(open(part_path, "r", encoding="utf-8")) + part_content = json.load(open(shard_path, "r", encoding="utf-8")) if isinstance(part_content, list): merged_rows.extend(part_content) elif isinstance(part_content, dict): @@ -469,165 +777,111 @@ def _merge_part_files(cls, dataset_name: str, merged_path: Path, part_paths: Lis except Exception: continue - removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] for row in merged_rows: term_info_raw = str(row.get("term_info", "")) kept_sentences: List[str] = [] for sentence in term_info_raw.split("."): - sentence_no_links = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence) + sentence_no_links = re.sub( + r"\[\[\d+\]\]\(https?://[^\)]+\)", "", sentence + ) if any(marker in sentence_no_links for marker in removal_markers): continue kept_sentences.append(sentence_no_links) - row["term_info"] = cls._clean_term_info(".".join(kept_sentences)) + row["term_info"] = self._normalize_text( + ".".join(kept_sentences), drop_questions=False + ) merged_path.parent.mkdir(parents=True, exist_ok=True) - json.dump(merged_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + json.dump( + merged_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) # best-effort cleanup - for part_path in part_paths: + for shard_path in shard_paths: try: - os.remove(part_path) + os.remove(shard_path) except Exception: pass - @staticmethod - def _fill_bucket_threaded(bucket_rows: List[dict], output_path: Path, provider: Callable[[str], str]) -> None: - start_index = 0 - try: - if output_path.is_file(): - existing_rows = json.load(open(output_path, "r", encoding="utf-8")) - if isinstance(existing_rows, list) and existing_rows: - bucket_rows[: len(existing_rows)] = existing_rows - start_index = len(existing_rows) - except Exception: - pass - - for row_index in range(start_index, len(bucket_rows)): - try: - bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) - except Exception: - bucket_rows[row_index]["term_info"] = "" - if row_index % 10 == 1: - json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - json.dump(bucket_rows, open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - @staticmethod - def _fill_bucket_process( - worker_id: int, - bucket_rows: List[dict], - output_path: Path, - provider: Callable[[str], str], - progress_map: "multiprocessing.managers.DictProxy", - ) -> None: - current_index = 0 - try: - if output_path.is_file(): - existing_rows = json.load(open(output_path, "r", encoding="utf-8")) - if isinstance(existing_rows, list) and existing_rows: - bucket_rows[: len(existing_rows)] = existing_rows - current_index = len(existing_rows) - except Exception: - pass - - progress_map[worker_id] = current_index - - for row_index in range(current_index, len(bucket_rows)): - try: - bucket_rows[row_index]["term_info"] = provider(bucket_rows[row_index]["term"]) - except Exception: - bucket_rows[row_index]["term_info"] = "" - progress_map[worker_id] = row_index + 1 - if row_index % 10 == 1: - json.dump(bucket_rows[: row_index + 1], open(output_path, "w", encoding="utf-8"), ensure_ascii=False, indent=2) - - json.dump(bucket_rows, open(output_path, "w", 
encoding="utf-8"), ensure_ascii=False, indent=2) - progress_map[worker_id] = len(bucket_rows) - - @classmethod def _execute_for_terms( - cls, + self, terms: List[str], merged_path: Path, - part_paths: List[Path], + shard_paths: List[Path], provider: Callable[[str], str], dataset_name: str, num_workers: int = 2, ) -> None: """ - Generate context for `terms`, writing shards to `part_paths`, then merge. - Threads on Windows; processes on POSIX. + Generate context for `terms`, writing shards to `shard_paths`, then merge. + + Always uses threads (pickling-safe for instance methods). + Shows a tqdm progress bar and merges shards at the end. """ worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) - all_rows = [{"id": row_index, "term": term, "term_info": ""} for row_index, term in enumerate(terms)] + all_rows = [ + {"id": index, "term": term, "term_info": ""} + for index, term in enumerate(terms) + ] buckets: List[List[dict]] = [[] for _ in range(worker_count)] for reversed_index, row in enumerate(reversed(all_rows)): buckets[reversed_index % worker_count].append(row) - if cls._is_windows(): - total_rows = len(terms) - progress_bar = tqdm(total=total_rows, desc=f"{dataset_name} generation (threads)") - - def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: - cls._fill_bucket_threaded(bucket_rows, out_path, provider) - return len(bucket_rows) - - with ThreadPoolExecutor(max_workers=worker_count) as pool: - futures = [pool.submit(run_bucket, buckets[bucket_index], part_paths[bucket_index]) - for bucket_index in range(worker_count)] - for future in as_completed(futures): - completed_count = future.result() - if progress_bar: - progress_bar.update(completed_count) - if progress_bar: - progress_bar.close() - else: - manager = multiprocessing.Manager() - progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) - - processes: List[multiprocessing.Process] = [] - for worker_index, bucket_rows in enumerate(buckets): - process = multiprocessing.Process( - target=cls._fill_bucket_process, - args=(worker_index, bucket_rows, part_paths[worker_index], provider, progress_map), + total_rows = len(terms) + progress_bar = tqdm( + total=total_rows, desc=f"{dataset_name} generation (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], shard_paths[bucket_index] ) - processes.append(process) - process.start() - - total_rows = len(terms) - with tqdm(total=total_rows, desc=f"{dataset_name} generation") as progress_bar: - previous_total = 0 - while any(process.is_alive() for process in processes): - current_total = int(sum(progress_map.values())) - progress_bar.update(current_total - previous_total) - previous_total = current_total - time.sleep(0.5) - current_total = int(sum(progress_map.values())) - if current_total > previous_total: - progress_bar.update(current_total - previous_total) - - for process in processes: - process.join() - - cls._merge_part_files(dataset_name, merged_path, part_paths) - - @classmethod + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + self._merge_part_files(dataset_name, merged_path, shard_paths) + def _re_infer_short_entries( 
- cls, + self, merged_path: Path, - re_part_paths: List[Path], + re_shard_paths: List[Path], re_merged_path: Path, provider: Callable[[str], str], dataset_name: str, num_workers: int, ) -> int: """ - Re-query terms with too-short term_info (< 50 chars). Returns remaining count. + Re-query terms whose `term_info` is too short (< 50 chars). + + Process: + - Read `merged_path`. + - Filter boilerplate using `_CONTEXT_REMOVALS` and `dataset_name`. + - Split into short/long groups by length 50. + - Regenerate short group with `provider` in parallel (threads). + - Merge regenerated + long back into `merged_path`. + + Returns: + int: Count of rows still < 50 chars after re-inference. """ merged_rows = json.load(open(merged_path, "r", encoding="utf-8")) - removal_markers = list(cls._CONTEXT_REMOVALS) + [dataset_name] + removal_markers = list(self._CONTEXT_REMOVALS) + [dataset_name] short_rows: List[dict] = [] long_rows: List[dict] = [] @@ -635,9 +889,14 @@ def _re_infer_short_entries( term_info_raw = str(row.get("term_info", "")) sentences = term_info_raw.split(".") for marker in removal_markers: - sentences = [sentence if marker not in sentence else "" for sentence in sentences] - filtered_info = re.sub(r"\[\[\d+\]\]\(https?://[^\)]+\)", "", ".".join(sentences)) + sentences = [ + sentence if marker not in sentence else "" for sentence in sentences + ] + filtered_info = self._normalize_text( + ".".join(sentences), drop_questions=False + ) row["term_info"] = filtered_info + (short_rows if len(filtered_info) < 50 else long_rows).append(row) worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) @@ -645,77 +904,83 @@ def _re_infer_short_entries( for row_index, row in enumerate(short_rows): buckets[row_index % worker_count].append(row) - # clean old re-inference shards - for path in re_part_paths: + # Clean old re-inference shards + for path in re_shard_paths: try: os.remove(path) except Exception: pass total_candidates = len(short_rows) - if cls._is_windows(): - progress_bar = tqdm(total=total_candidates, desc=f"{dataset_name} re-inference (threads)") - - def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: - cls._fill_bucket_threaded(bucket_rows, out_path, provider) - return len(bucket_rows) - - with ThreadPoolExecutor(max_workers=worker_count) as pool: - futures = [pool.submit(run_bucket, buckets[bucket_index], re_part_paths[bucket_index]) - for bucket_index in range(worker_count)] - for future in as_completed(futures): - completed_count = future.result() - if progress_bar: - progress_bar.update(completed_count) - if progress_bar: - progress_bar.close() - else: - manager = multiprocessing.Manager() - progress_map = manager.dict({worker_index: 0 for worker_index in range(worker_count)}) - - processes: List[multiprocessing.Process] = [] - for worker_index, bucket_rows in enumerate(buckets): - process = multiprocessing.Process( - target=cls._fill_bucket_process, - args=(worker_index, bucket_rows, re_part_paths[worker_index], provider, progress_map), + progress_bar = tqdm( + total=total_candidates, desc=f"{dataset_name} re-inference (threads)" + ) + + def run_bucket(bucket_rows: List[dict], out_path: Path) -> int: + self._fill_bucket_threaded(bucket_rows, out_path, provider) + return len(bucket_rows) + + with ThreadPoolExecutor(max_workers=worker_count) as pool: + futures = [ + pool.submit( + run_bucket, buckets[bucket_index], re_shard_paths[bucket_index] ) - processes.append(process) - process.start() - - with tqdm(total=total_candidates, desc=f"{dataset_name} re-inference") as 
progress_bar: - previous_total = 0 - while any(process.is_alive() for process in processes): - current_total = int(sum(progress_map.values())) - progress_bar.update(current_total - previous_total) - previous_total = current_total - time.sleep(1) - if progress_bar.n < total_candidates: - progress_bar.update(total_candidates - progress_bar.n) - - for process in processes: - process.join() - - # merge and write back - cls._merge_part_files(dataset_name, re_merged_path, re_part_paths) - new_rows = json.load(open(re_merged_path, "r", encoding="utf-8")) if re_merged_path.is_file() else [] + for bucket_index in range(worker_count) + ] + for future in as_completed(futures): + completed_count = future.result() + if progress_bar: + progress_bar.update(completed_count) + if progress_bar: + progress_bar.close() + + # Merge and write back + self._merge_part_files(dataset_name, re_merged_path, re_shard_paths) + new_rows = ( + json.load(open(re_merged_path, "r", encoding="utf-8")) + if re_merged_path.is_file() + else [] + ) final_rows = long_rows + new_rows - json.dump(final_rows, open(merged_path, "w", encoding="utf-8"), ensure_ascii=False, indent=4) + json.dump( + final_rows, + open(merged_path, "w", encoding="utf-8"), + ensure_ascii=False, + indent=4, + ) - remaining_short = sum(1 for row in final_rows if len(str(row.get("term_info", ""))) < 50) + remaining_short = sum( + 1 for row in final_rows if len(str(row.get("term_info", ""))) < 50 + ) return remaining_short - @staticmethod - def _extract_terms_from_ontology(ontology: Any) -> List[str]: + def _extract_terms_from_ontology(self, ontology: Any) -> List[str]: """ - Collect unique term names from ontology.type_taxonomies.taxonomies. + Collect unique term names from `ontology.type_taxonomies.taxonomies`, + falling back to `ontology.taxonomies` if needed. + + Returns: + list[str]: Sorted unique term list. """ type_taxonomies = getattr(ontology, "type_taxonomies", None) - taxonomies = getattr(type_taxonomies, "taxonomies", None) if type_taxonomies is not None else getattr(ontology, "taxonomies", None) + taxonomies = ( + getattr(type_taxonomies, "taxonomies", None) + if type_taxonomies is not None + else getattr(ontology, "taxonomies", None) + ) unique_terms: set[str] = set() if taxonomies: for row in taxonomies: - parent_term = getattr(row, "parent", None) if not isinstance(row, dict) else row.get("parent") - child_term = getattr(row, "child", None) if not isinstance(row, dict) else row.get("child") + parent_term = ( + getattr(row, "parent", None) + if not isinstance(row, dict) + else row.get("parent") + ) + child_term = ( + getattr(row, "child", None) + if not isinstance(row, dict) + else row.get("child") + ) if parent_term: unique_terms.add(str(parent_term)) if child_term: @@ -732,15 +997,32 @@ def preprocess_context_from_ontology( max_retries: int = 5, ) -> Path: """ - Build {id, term, term_info} from an ontology object. - Always regenerates {dataset_name}_processed.json. + Build `{id, term, term_info}` rows from an ontology object. + + Always regenerates the fixed-name file `rwthdbis_onto_processed.json`, + performing: + - Parallel generation of term_info in shards (`_execute_for_terms`), + - Re-inference rounds for short entries (`_re_infer_short_entries`), + - Final merge and cleanup, + - Updates `self.context_json_path`. 
+ + Filenames under `processed_dir`: + - merged: `rwthdbis_onto_processed.json` + - shards: `rwthdbis_onto_type_part{idx}.json` + - re-infer shards: `rwthdbis_onto_re_inference{idx}.json` + - re-infer merged: `rwthdbis_onto_Types_re_inference.json` + + Returns: + Path: The merged context JSON path (`rwthdbis_onto_processed.json`). """ - provider = provider or provider or partial(self._default_gpt_inference_with_dataset, dataset_name=dataset_name) + provider = provider or partial( + self._default_gpt_inference_with_dataset, dataset_name=dataset_name + ) processed_dir = Path(processed_dir) processed_dir.mkdir(parents=True, exist_ok=True) - merged_path = processed_dir / f"{dataset_name}_processed.json" + merged_path = processed_dir / "rwthdbis_onto_processed.json" if merged_path.exists(): try: merged_path.unlink() @@ -748,12 +1030,18 @@ def preprocess_context_from_ontology( pass worker_count = max(1, min(num_workers, os.cpu_count() or 2, 4)) - shard_paths = [processed_dir / f"{dataset_name}_type_part{shard_index}.json" for shard_index in range(worker_count)] - reinf_paths = [processed_dir / f"{dataset_name}_re_inference{shard_index}.json" for shard_index in range(worker_count)] - reinf_merged_path = processed_dir / f"{dataset_name}_Types_re_inference.json" - - # remove any leftover shards - for path in shard_paths + reinf_paths + [reinf_merged_path]: + shard_paths = [ + processed_dir / f"rwthdbis_onto_type_part{index}.json" + for index in range(worker_count) + ] + re_shard_paths = [ + processed_dir / f"rwthdbis_onto_re_inference{index}.json" + for index in range(worker_count) + ] + re_merged_path = processed_dir / "rwthdbis_onto_Types_re_inference.json" + + # Remove any leftover shards + for path in shard_paths + re_shard_paths + [re_merged_path]: try: if path.exists(): path.unlink() @@ -766,7 +1054,7 @@ def preprocess_context_from_ontology( self._execute_for_terms( terms=unique_terms, merged_path=merged_path, - part_paths=shard_paths, + shard_paths=shard_paths, provider=provider, dataset_name=dataset_name, num_workers=worker_count, @@ -776,13 +1064,15 @@ def preprocess_context_from_ontology( while retry_round < max_retries: remaining_count = self._re_infer_short_entries( merged_path=merged_path, - re_part_paths=reinf_paths, - re_merged_path=reinf_merged_path, + re_shard_paths=re_shard_paths, + re_merged_path=re_merged_path, provider=provider, dataset_name=dataset_name, num_workers=worker_count, ) - print(f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}") + print( + f"[Preprocess] Re-infer round {retry_round + 1} done. Remaining short entries: {remaining_count}" + ) retry_round += 1 if remaining_count == 0: break diff --git a/ontolearner/learner/taxonomy_discovery/sbunlp.py b/ontolearner/learner/taxonomy_discovery/sbunlp.py index 9fc520d..660ec6e 100644 --- a/ontolearner/learner/taxonomy_discovery/sbunlp.py +++ b/ontolearner/learner/taxonomy_discovery/sbunlp.py @@ -1,45 +1,33 @@ # Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# License: MIT import os import re import json -import importlib.util -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import torch -from transformers import AutoTokenizer, AutoModelForCausalLM - +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig from ...base import AutoLearner + class SBUNLPFewShotLearner(AutoLearner): """ - Taxonomy-discovery via N×M batch prompting with a small Qwen model. - - Lifecycle - --------- - fit(): - Cache + clean training parent–child pairs. - predict(): - Chunk (train pairs × test terms), prompt per chunk pair, parse, merge, - and deduplicate predicted relations. + Few-shot taxonomy discovery via N×M batch prompting. + + This learner: + - Caches & cleans gold parent–child pairs during `fit`. + - Splits (train pairs × test terms) into a grid of chunks. + - Builds an instruction prompt per grid cell with few-shot JSON examples. + - Generates and parses model outputs as JSON relations. + - Merges & deduplicates all predicted edges. """ def __init__( self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", try_4bit: bool = True, + device: str = "cpu", num_train_chunks: int = 7, num_test_chunks: int = 7, max_new_tokens: int = 140, @@ -50,88 +38,117 @@ def __init__( output_dir: Optional[str] = None, **kwargs: Any, ) -> None: + """ + Initialize the learner and core generation / batching settings. + + Args: + model_name: HF id/path of the causal LLM (e.g., Qwen Instruct). + try_4bit: If True and on CUDA, load with 4-bit NF4 quantization. + device: "cpu" or "cuda" for model execution. + num_train_chunks: Number of chunks for the gold (parent, child) bank. + num_test_chunks: Number of chunks for the test term list. + max_new_tokens: Max new tokens to generate per prompt call. + max_input_tokens: Clip the *input* prompt to this many tokens (tail kept). + temperature: Sampling temperature; 0.0 uses greedy decoding. + top_p: Nucleus sampling parameter (used when temperature > 0). + limit_num_prompts: Optional hard cap on prompts issued (debug/cost). + output_dir: Optional directory to save per-batch JSON predictions. + **kwargs: Forwarded to the base class. + """ super().__init__(**kwargs) self.model_name = model_name self.try_4bit = try_4bit + self.device = device self.num_train_chunks = num_train_chunks self.num_test_chunks = num_test_chunks - self.max_new_tokens = max_new_tokens self.max_input_tokens = max_input_tokens self.temperature = temperature self.top_p = top_p self.limit_num_prompts = limit_num_prompts - self.output_dir = output_dir self.tokenizer: Optional[AutoTokenizer] = None self.model: Optional[AutoModelForCausalLM] = None - self.device = "cuda" if torch.cuda.is_available() else "cpu" - self.train_pairs_clean: List[Dict[str, str]] = [] - # ----------------------- small helpers ---------------------- - def _clean_pairs(pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: - """ - Normalize, drop empty or self-relations, and deduplicate by (parent, child). + def _clean_pairs(self, pair_rows: List[Dict[str, str]]) -> List[Dict[str, str]]: """ - cleaned_pairs: List[Dict[str, str]] = [] - seen_parent_child: set[Tuple[str, str]] = set() + Normalize, filter, and deduplicate relation pairs. - for pair_record in pair_rows or []: - if not isinstance(pair_record, dict): - continue + Operations: + - Cast 'parent'/'child' to strings and strip whitespace. + - Drop rows with empty values. + - Drop self-relations (case-insensitive parent == child). 
+ - Deduplicate by lowercase (parent, child). - parent_label = str(pair_record.get("parent", "")).strip() - child_label = str(pair_record.get("child", "")).strip() - if not parent_label or not child_label: - continue + Args: + pair_rows: Raw list of dicts with at least 'parent' and 'child'. - normalized_key = (parent_label.lower(), child_label.lower()) - if normalized_key[0] == normalized_key[1]: # parent==child + Returns: + Cleaned list of {'parent','child'} dicts. + """ + cleaned, seen = [], set() + for rec in pair_rows or []: + if not isinstance(rec, dict): + continue + p = str(rec.get("parent", "")).strip() + c = str(rec.get("child", "")).strip() + if not p or not c: continue - if normalized_key in seen_parent_child: + key = (p.lower(), c.lower()) + if key[0] == key[1] or key in seen: continue + seen.add(key) + cleaned.append({"parent": p, "child": c}) + return cleaned - seen_parent_child.add(normalized_key) - cleaned_pairs.append({"parent": parent_label, "child": child_label}) + def _chunk_list(self, items: List[Any], num_chunks: int) -> List[List[Any]]: + """ + Split a list into `num_chunks` near-equal contiguous parts. - return cleaned_pairs + Args: + items: Sequence to split. + num_chunks: Number of chunks to produce; if <= 0, returns [items]. - def _chunk_list(items: List[Any], num_chunks: int) -> List[List[Any]]: - """ - Split `items` into `num_chunks` near-equal parts. Some chunks may be empty. + Returns: + List of chunks (some may be empty if len(items) < num_chunks). """ if num_chunks <= 0: return [items] - total_items = len(items) - base_size, remainder = divmod(total_items, num_chunks) - - chunks: List[List[Any]] = [] - start_index = 0 - for chunk_index in range(num_chunks): - current_size = base_size + (1 if chunk_index < remainder else 0) - end_index = start_index + current_size - chunks.append(items[start_index:end_index]) - start_index = end_index - return chunks - - def _ensure_dir(self, maybe_path: Optional[str]) -> None: - if maybe_path: - os.makedirs(maybe_path, exist_ok=True) - - # ---------------------- model load/gen ---------------------- - def load(self, **_: Any) -> None: + n = len(items) + base, rem = divmod(n, num_chunks) + out, start = [], 0 + for i in range(num_chunks): + size = base + (1 if i < rem else 0) + out.append(items[start : start + size]) + start += size + return out + + def _ensure_dir(self, path: Optional[str]) -> None: """ - Load tokenizer/model; use 4-bit nf4 on CUDA if available + requested. + Create a directory if `path` is a non-empty string. + + Args: + path: Directory to create (recursively). Ignored if falsy. """ - bnb_available = importlib.util.find_spec("bitsandbytes") is not None - use_4bit_quant = bool(self.try_4bit and bnb_available and self.device == "cuda") + if path: + os.makedirs(path, exist_ok=True) + def load(self, **_: Any) -> None: + """ + Load tokenizer and model; optionally enable 4-bit quantization. + + Assumes bitsandbytes is available if `try_4bit=True` on CUDA. + Sets tokenizer pad token if missing. Places model on GPU (device_map='auto') + when `device='cuda'`, otherwise on CPU. + + Args: + **_: Unused kwargs for interface compatibility. 
+ """ quant_config = None - if use_4bit_quant: - from transformers import BitsAndBytesConfig + if self.try_4bit and self.device == "cuda": quant_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, @@ -140,8 +157,11 @@ def load(self, **_: Any) -> None: ) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token + if getattr(self.tokenizer, "pad_token_id", None) is None: + if getattr(self.tokenizer, "eos_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token + elif getattr(self.tokenizer, "unk_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.unk_token self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -149,12 +169,26 @@ def load(self, **_: Any) -> None: torch_dtype=(torch.float16 if self.device == "cuda" else torch.float32), quantization_config=quant_config, ) + if self.device == "cpu": + self.model.to("cpu") def _format_chat(self, user_text: str) -> str: """ - Wrap user text with the model's chat template (if present). + Wrap plain text with the model's chat template, if provided. + + Many instruction-tuned models expose `tokenizer.chat_template`. + If available, use it to construct a proper chat prompt; otherwise, + return the text unchanged. + + Args: + user_text: Content of the user message. + + Returns: + A generation-ready prompt string. """ - if hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None): + if hasattr(self.tokenizer, "apply_chat_template") and getattr( + self.tokenizer, "chat_template", None + ): return self.tokenizer.apply_chat_template( [{"role": "user", "content": user_text}], tokenize=False, @@ -165,17 +199,31 @@ def _format_chat(self, user_text: str) -> str: @torch.no_grad() def _generate(self, prompt_text: str) -> str: """ - Single prompt → model text. Clips *input* tokens to avoid overflow. - """ - formatted_prompt = self._format_chat(prompt_text) - prompt_token_ids = self.tokenizer(formatted_prompt, add_special_tokens=False, return_tensors=None)["input_ids"] - if len(prompt_token_ids) > self.max_input_tokens: - prompt_token_ids = prompt_token_ids[-self.max_input_tokens:] + Generate text for a single prompt, guarding input length. + + Steps: + 1) Format prompt via chat template (if present). + 2) Tokenize and clip the *input* to `max_input_tokens` (tail kept). + 3) Call `model.generate` with configured decoding params. + 4) Strip the echoed prompt from the decoded output (if present). - prompt_tensor = torch.tensor([prompt_token_ids]).to(self.model.device) + Args: + prompt_text: Textual prompt to feed the model. - generation = self.model.generate( - input_ids=prompt_tensor, + Returns: + Model continuation string (prompt-echo stripped when applicable). 
+ """ + formatted = self._format_chat(prompt_text) + ids = self.tokenizer(formatted, add_special_tokens=False, return_tensors=None)[ + "input_ids" + ] + if len(ids) > self.max_input_tokens: + ids = ids[-self.max_input_tokens :] + device = next(self.model.parameters()).device + input_ids = torch.tensor([ids], device=device) + + out = self.model.generate( + input_ids=input_ids, max_new_tokens=self.max_new_tokens, do_sample=(self.temperature > 0.0), temperature=self.temperature, @@ -185,20 +233,37 @@ def _generate(self, prompt_text: str) -> str: use_cache=True, ) - decoded_full = self.tokenizer.decode(generation[0], skip_special_tokens=True) - decoded_prompt = self.tokenizer.decode(prompt_tensor[0], skip_special_tokens=True) - return decoded_full[len(decoded_prompt):].strip() if decoded_full.startswith(decoded_prompt) else decoded_full.strip() + decoded_full = self.tokenizer.decode(out[0], skip_special_tokens=True) + decoded_prompt = self.tokenizer.decode(input_ids[0], skip_special_tokens=True) + return ( + decoded_full[len(decoded_prompt) :].strip() + if decoded_full.startswith(decoded_prompt) + else decoded_full.strip() + ) - # ------------------ prompt build & parsing ------------------ - def _build_prompt(train_pairs_chunk: List[Dict[str, str]], - test_terms_chunk: List[str]) -> str: + def _build_prompt( + self, + train_pairs_chunk: List[Dict[str, str]], + test_terms_chunk: List[str], + ) -> str: """ - Few-shot with JSON examples + a block of test terms. - The model must return ONLY a JSON array of {parent, child}. + Construct a few-shot prompt with JSON examples and test terms. + + The prompt: + - Shows several gold (parent, child) examples in JSON. + - Lists the test terms (one per line) between [PAIR] tags. + - Instructs to return ONLY a JSON array of {'parent','child'}. + + Args: + train_pairs_chunk: Cleaned training relations for examples. + test_terms_chunk: The current chunk of test terms. + + Returns: + The fully formatted prompt string. """ examples_json = json.dumps(train_pairs_chunk, ensure_ascii=False, indent=2) - test_types_block = "\n".join(test_terms_chunk) - return ( + test_block = "\n".join(test_terms_chunk) + prompt = ( "From this file, extract all parent–child relations like in the examples.\n" "Return ONLY a JSON array of objects with keys 'parent' and 'child'.\n" "Output format:\n" @@ -210,108 +275,128 @@ def _build_prompt(train_pairs_chunk: List[Dict[str, str]], f"{examples_json}\n\n" "TEST TYPES (between [PAIR] tags):\n" "[PAIR]\n" - f"{test_types_block}\n" + f"{test_block}\n" "[PAIR]\n" "Return only JSON." ) + return prompt - def _parse_pairs(model_text: str) -> List[Dict[str, str]]: - """ - Parse a model response into a list of {'parent','child'} dicts. + def _parse_pairs(self, text: str) -> List[Dict[str, str]]: """ - def deduplicate_and_normalize(dict_list: List[Dict[str, str]]) -> List[Dict[str, str]]: - return SBUNLPFewShotLearner._clean_pairs(dict_list) + Parse a generation string into a list of relation dicts. - response_text = model_text.strip() + Parsing strategy: + 1) Try to parse the entire string as JSON; expect a list. + 2) Else, regex-extract the outermost JSON-like array and parse that. + 3) On failure, return an empty list. - # 1) Direct JSON list + Args: + text: Raw model output. + + Returns: + Cleaned list of {'parent','child'} dicts (possibly empty). 
+ """ + text = text.strip() try: - maybe_json = json.loads(response_text) - if isinstance(maybe_json, list): - return deduplicate_and_normalize(maybe_json) + obj = json.loads(text) + if isinstance(obj, list): + return self._clean_pairs(obj) except Exception: pass - - # 2) Find outermost [ ... ] and parse that - outer_list_match = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", response_text) - if outer_list_match: + m = re.search(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)*\]", text) + if m: try: - array_json = json.loads(outer_list_match.group(0)) - if isinstance(array_json, list): - return deduplicate_and_normalize(array_json) + obj = json.loads(m.group(0)) + if isinstance(obj, list): + return self._clean_pairs(obj) except Exception: pass - - # 3) Nothing parsable return [] - # --------------------- AutoLearner hooks -------------------- def fit(self, train_data: Any, task: str, ontologizer: bool = True): """ - Build the training example bank (parent–child pairs). + Cache and clean gold relations for few-shot prompting. + + For `task == "taxonomy-discovery"`: + - If `ontologizer=True`, convert ontology-like input into + a list of {'parent','child'} via the base helper. + - Otherwise, accept a user-provided list directly. + - Store a cleaned, deduplicated bank in `self.train_pairs_clean`. + + Args: + train_data: Ontology-like object or list of relation dicts. + task: Task selector (expects "taxonomy-discovery"). + ontologizer: Whether to transform ontology inputs. + + Returns: + None. (State is stored on the instance.) """ if task != "taxonomy-discovery": return super().fit(train_data, task, ontologizer) - if ontologizer: - # Convert ontology object → list of {"parent","child"} gold pairs - gold_pairs_from_ontology = self.tasks_ground_truth_former( - train_data, task="taxonomy-discovery" - ) - self.train_pairs_clean = self._clean_pairs(gold_pairs_from_ontology) + gold = self.tasks_ground_truth_former(train_data, task="taxonomy-discovery") + self.train_pairs_clean = self._clean_pairs(gold) else: - # Already a Python list of dicts self.train_pairs_clean = self._clean_pairs(train_data) - def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[Any]: + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: """ - Main prediction path. Returns a deduplicated list of relations. + Run few-shot inference (test=True) or no-op during training. + + Inference steps: + - Ensure tokenizer/model are loaded. + - Normalize `data` to a list of test terms (via base helper if needed). + - Create the N×M grid across (train_pairs_chunk × test_terms_chunk). + - For each cell: build prompt → generate → parse → (optionally) save. + - Merge and deduplicate all predicted pairs before returning. + + Args: + data: Test input (ontology-like, list of strings, or mixed). + test: If True, perform prediction; otherwise return None. + + Returns: + On `test=True`: deduplicated list of {'parent','child'}. + On `test=False`: None. 
""" if not test: return None - if self.model is None or self.tokenizer is None: self.load() - # Build test vocabulary of types/terms if isinstance(data, list) and (len(data) == 0 or isinstance(data[0], str)): - test_type_list: List[str] = data + test_terms: List[str] = data else: - test_type_list = super().tasks_data_former( + test_terms = super().tasks_data_former( data=data, task="taxonomy-discovery", test=True ) - # Create N×M grid train_chunks = self._chunk_list(self.train_pairs_clean, self.num_train_chunks) - test_chunks = self._chunk_list(test_type_list, self.num_test_chunks) + test_chunks = self._chunk_list(test_terms, self.num_test_chunks) self._ensure_dir(self.output_dir) - merged_predicted_pairs: List[Dict[str, str]] = [] - issued_prompt_count = 0 + merged: List[Dict[str, str]] = [] + issued = 0 - for train_chunk_index, train_pairs_chunk in enumerate(train_chunks, start=1): - for test_chunk_index, test_terms_chunk in enumerate(test_chunks, start=1): - issued_prompt_count += 1 - if self.limit_num_prompts and issued_prompt_count > self.limit_num_prompts: + for ti, tr in enumerate(train_chunks, 1): + for si, ts in enumerate(test_chunks, 1): + issued += 1 + if self.limit_num_prompts and issued > self.limit_num_prompts: break + prompt = self._build_prompt(tr, ts) + resp = self._generate(prompt) + pairs = self._parse_pairs(resp) - prompt_text = self._build_prompt(train_pairs_chunk, test_terms_chunk) - model_response = self._generate(prompt_text) - parsed_relation_pairs = self._parse_pairs(model_response) - - # Optional per-batch dump for debugging if self.output_dir: - batch_json_path = os.path.join( - self.output_dir, f"pairs_T{train_chunk_index}_S{test_chunk_index}.json" - ) - with open(batch_json_path, "w", encoding="utf-8") as fp: - json.dump(parsed_relation_pairs, fp, ensure_ascii=False, indent=2) + path = os.path.join(self.output_dir, f"pairs_T{ti}_S{si}.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(pairs, f, ensure_ascii=False, indent=2) - merged_predicted_pairs.extend(parsed_relation_pairs) + merged.extend(pairs) - if self.limit_num_prompts and issued_prompt_count >= (self.limit_num_prompts or 0): + if self.limit_num_prompts and issued >= (self.limit_num_prompts or 0): break - # Deduplicate final list - return self._clean_pairs(merged_predicted_pairs) + return self._clean_pairs(merged) diff --git a/ontolearner/learner/taxonomy_discovery/skhnlp.py b/ontolearner/learner/taxonomy_discovery/skhnlp.py index fbe53b4..c242aab 100644 --- a/ontolearner/learner/taxonomy_discovery/skhnlp.py +++ b/ontolearner/learner/taxonomy_discovery/skhnlp.py @@ -23,6 +23,7 @@ from typing import Any, Optional, List, Tuple, Dict from transformers import ( AutoTokenizer, + AutoModelForSequenceClassification, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, @@ -35,10 +36,20 @@ from ...utils import taxonomy_split, train_test_split as ontology_split from ...data_structure import OntologyData, TaxonomicRelation + class SKHNLPTaxonomyPrompts(AutoPrompt): - """Builds the 7 taxonomy prompts used during fine-tuning / inference.""" + """Builds the 7 taxonomy prompts used during fine-tuning / inference. + + The class stores a small inventory of prompt templates that verbalize the + (parent, child) relationship using different phrasings. Each template ends + with a masked token slot intended for True/False classification. + """ + def __init__(self) -> None: - super().__init__(prompt_template="{parent} is the superclass of {child}. 
This statement is [MASK].")
+        """Initialize prompt templates and the default prompt in the base class."""
+        super().__init__(
+            prompt_template="{parent} is the superclass of {child}. This statement is [MASK]."
+        )
         self.templates: List[str] = [
             "{parent} is the superclass of {child}. This statement is [MASK].",
             "{child} is a subclass of {parent}. This statement is [MASK].",
@@ -49,7 +60,17 @@ def __init__(self) -> None:
             "{parent} is an ancestor class of {child}. This statement is [MASK].",
         ]
 
-    def make(self, parent: str, child: str, template_idx: int) -> str:
+    def format(self, parent: str, child: str, template_idx: int) -> str:
+        """Render a prompt for a (parent, child) pair using a specific template.
+
+        Args:
+            parent: The parent/superclass label.
+            child: The child/subclass label.
+            template_idx: Index into the internal `templates` list.
+
+        Returns:
+            The fully formatted prompt string.
+        """
         return self.templates[template_idx].format(parent=parent, child=child)
 
 
@@ -66,20 +87,18 @@ class SKHNLPSequentialFTLearner(AutoLearner):
     * PREDICT/TEST: pairwise binary classification (returns label + score).
     """
 
-    # Fixed constants defining data split size and the proportional mix of
-    # negative sample types (reversed vs. manipulated) for balancing.
-    _EVAL_FRACTION: float = 0.16
-    _NEG_RATIO_REVERSED: float = 1/3
-    _NEG_RATIO_MANIPULATED: float = 2/3
-
     def __init__(
         self,
         # core
        model_name: str = "bert-large-uncased",
         n_prompts: int = 7,
         random_state: int = 1403,
-        device: Optional[str] = None,  # "cuda" | "cpu" | None (auto)
-
+        num_labels: int = 2,
+        device: str = "cpu",  # "cuda" | "cpu"
+        # data split & negative sampling (now configurable)
+        eval_fraction: float = 0.16,
+        neg_ratio_reversed: float = 1 / 3,
+        neg_ratio_manipulated: float = 2 / 3,
         # ---- expose TrainingArguments as individual user-defined args ----
         output_dir: str = "./results/",
         num_train_epochs: int = 1,
@@ -92,12 +111,52 @@ def __init__(
         eval_strategy: str = "epoch",
         save_strategy: str = "epoch",
         load_best_model_at_end: bool = True,
+        use_fast_tokenizer: Optional[bool] = None,
+        trust_remote_code: bool = False,
     ) -> None:
+        """Configure the sequential fine-tuning learner.
+
+        Args:
+            model_name: HF model id or local path for the BERT backbone.
+            n_prompts: Number of prompt variants to iterate over sequentially.
+            random_state: RNG seed for shuffling/sampling steps.
+            num_labels: Number of classes for the classifier head.
+            device: Device to run the model on ('cuda' or 'cpu'); defaults to 'cpu'.
+            eval_fraction: Fraction of positives to hold out for evaluation.
+            neg_ratio_reversed: Proportion of reversed-parent negatives vs positives.
+            neg_ratio_manipulated: Proportion of random-parent negatives vs positives.
+            output_dir: Directory where HF Trainer writes checkpoints/outputs.
+            num_train_epochs: Number of epochs per prompt.
+            per_device_train_batch_size: Training batch size per device.
+            per_device_eval_batch_size: Evaluation batch size per device.
+            warmup_steps: Linear warmup steps for LR scheduler.
+            weight_decay: Weight decay coefficient.
+            logging_dir: Directory for Trainer logs.
+            logging_steps: Interval for log events (in steps).
+            eval_strategy: Evaluation schedule ('no', 'steps', 'epoch').
+            save_strategy: Checkpoint save schedule ('no', 'steps', 'epoch').
+            load_best_model_at_end: Whether to restore the best checkpoint.
+            use_fast_tokenizer: Force fast/slow tokenizer. If None, try fast then fallback to slow.
+        Notes:
+            The model is fine-tuned *sequentially* across prompt columns.
+ You can control the eval split and negative sampling mix via + `eval_fraction`, `neg_ratio_reversed`, and `neg_ratio_manipulated`. + """ super().__init__() self.model_name = model_name self.n_prompts = n_prompts self.random_state = random_state - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.num_labels = num_labels + self.device = device + + # user-tunable ratios / split + self._eval_fraction = float(eval_fraction) + self._neg_ratio_reversed = float(neg_ratio_reversed) + self._neg_ratio_manipulated = float(neg_ratio_manipulated) + if not (0.0 < self._eval_fraction < 1.0): + raise ValueError("eval_fraction must be in (0, 1).") + if self._neg_ratio_reversed < 0 or self._neg_ratio_manipulated < 0: + raise ValueError("neg_ratio_* must be >= 0.") self.tokenizer: Optional[BertTokenizer] = None self.model: Optional[BertForSequenceClassification] = None @@ -109,6 +168,8 @@ def __init__( # Keep last train/eval tables for inspection self._last_train: Optional[pd.DataFrame] = None self._last_eval: Optional[pd.DataFrame] = None + self.trust_remote_code = bool(trust_remote_code) + self.use_fast_tokenizer = use_fast_tokenizer random.seed(self.random_state) @@ -128,19 +189,77 @@ def __init__( ) def load(self, model_id: Optional[str] = None, **_: Any) -> None: - """Load tokenizer and model; move model to the requested device.""" + """Load tokenizer & model in a backbone-agnostic way; move model to self.device.""" model_id = model_id or self.model_name - self.tokenizer = BertTokenizer.from_pretrained(model_id) - self.model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2) + + # ---- Tokenizer (robust fast→slow fallback unless explicitly set) ---- + if self.use_fast_tokenizer is None: + try: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=self.trust_remote_code + ) + except Exception as fast_err: + print( + f"[tokenizer] Fast tokenizer failed: {fast_err}. Falling back to slow tokenizer..." 
+ ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=self.trust_remote_code + ) + else: + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, + use_fast=self.use_fast_tokenizer, + trust_remote_code=self.trust_remote_code, + ) + + # Ensure pad token exists (some models lack it) + if getattr(self.tokenizer, "pad_token", None) is None: + # Try sensible fallbacks + fallback = ( + getattr(self.tokenizer, "eos_token", None) + or getattr(self.tokenizer, "sep_token", None) + or getattr(self.tokenizer, "cls_token", None) + ) + if fallback is not None: + self.tokenizer.pad_token = fallback + + # ---- Model (classifier head sized to self.num_labels) ---- + self.model = AutoModelForSequenceClassification.from_pretrained( + model_id, + num_labels=self.num_labels, + trust_remote_code=self.trust_remote_code, + # Allows swapping in a new head size even if the checkpoint differs + ignore_mismatched_sizes=True, + ) + + # Make sure padding ids line up + if ( + getattr(self.model.config, "pad_token_id", None) is None + and getattr(self.tokenizer, "pad_token_id", None) is not None + ): + self.model.config.pad_token_id = self.tokenizer.pad_token_id + + # Set problem type (single-label classification by default) + # If you plan multi-label, you'd switch to "multi_label_classification" self.model.config.problem_type = "single_label_classification" - # place on device chosen by user (or auto) - target_device = self.device - if target_device not in {"cuda", "cpu"}: - target_device = "cuda" if torch.cuda.is_available() else "cpu" - self.model.to(target_device) + # Move to target device + self.model.to(self.device) def tasks_ground_truth_former(self, data: Any, task: str) -> Any: + """Normalize ground-truth inputs for 'taxonomy-discovery'. + + Supports DataFrame with columns ['parent','child',('label')], + list of dicts, or falls back to the base class behavior. + + Args: + data: Input object to normalize. + task: Task name, passed from the outer pipeline. + + Returns: + A list of dictionaries with keys 'parent', 'child', and optionally + 'label' when present in the input. + """ if task != "taxonomy-discovery": return super().tasks_ground_truth_former(data, task) @@ -150,15 +269,29 @@ def tasks_ground_truth_former(self, data: Any, task: str) -> Any: {"parent": p, "child": c, "label": bool(lbl)} for p, c, lbl in zip(data["parent"], data["child"], data["label"]) ] - return [{"parent": p, "child": c} for p, c in zip(data["parent"], data["child"])] + return [ + {"parent": p, "child": c} for p, c in zip(data["parent"], data["child"]) + ] if isinstance(data, list): return data return super().tasks_ground_truth_former(data, task) - def _make_negatives(self, positives_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Return (reversed_df, manipulated_df).""" + def _make_negatives( + self, positives_df: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create two types of negatives from a positives table. + + Returns: + A tuple `(reversed_df, manipulated_df)` where: + - `reversed_df`: pairs with parent/child columns swapped, label=False. + - `manipulated_df`: pairs with the parent replaced by a random + *different* parent from the same pool, label=False. + + Notes: + The input DataFrame must contain columns ['parent', 'child']. 
+ """ unique_parents = positives_df["parent"].unique().tolist() def as_reversed(df: pd.DataFrame) -> pd.DataFrame: @@ -171,6 +304,7 @@ def with_random_parent(df: pd.DataFrame) -> pd.DataFrame: def pick_other_parent(p: str) -> str: pool = [x for x in unique_parents if x != p] return random.choice(pool) if pool else p + out = df.copy() out["parent"] = out["parent"].apply(pick_other_parent) out["label"] = False @@ -184,10 +318,23 @@ def _balance_with_negatives( reversed_df: pd.DataFrame, manipulated_df: pd.DataFrame, ) -> pd.DataFrame: - """Combine positives and negatives with the same ratios as before.""" + """Combine positives with negatives using configured ratios. + + Sampling ratios are defined by the instance settings + `self._neg_ratio_reversed` and `self._neg_ratio_manipulated`, + keeping the positives count unchanged. + + Args: + positives_df: Positive pairs with `label=True`. + reversed_df: Negative pairs produced by flipping parent/child. + manipulated_df: Negative pairs with randomly reassigned parents. + + Returns: + A deduplicated, shuffled DataFrame with a class-balanced mix. + """ n_pos = len(positives_df) - n_rev = int(n_pos * self._NEG_RATIO_REVERSED) - n_man = int(n_pos * self._NEG_RATIO_MANIPULATED) + n_rev = int(n_pos * self._neg_ratio_reversed) + n_man = int(n_pos * self._neg_ratio_manipulated) combined = pd.concat( [ @@ -197,26 +344,75 @@ def _balance_with_negatives( ], ignore_index=True, ) - combined = combined.drop_duplicates(subset=["parent", "child", "label"]).reset_index(drop=True) + combined = combined.drop_duplicates( + subset=["parent", "child", "label"] + ).reset_index(drop=True) return combined def _add_prompt_columns(self, df: pd.DataFrame) -> pd.DataFrame: + """Append one column per prompt variant to the given pairs table. + + For each row `(parent, child)`, creates columns `prompt_1 ... prompt_n`. + + Args: + df: Input DataFrame with columns ['parent', 'child', ...]. + + Returns: + A copy of `df` including the newly added prompt columns. + """ out = df.copy() for i in range(self.n_prompts): - out[f"prompt_{i+1}"] = out.apply( - lambda r, k=i: self.prompter.make(r["parent"], r["child"], k), axis=1 + out[f"prompt_{i + 1}"] = out.apply( + lambda r, k=i: self.prompter.format(r["parent"], r["child"], k), axis=1 ) return out - def _df_from_relations(relations: List[TaxonomicRelation], label: bool = True) -> pd.DataFrame: + def _df_from_relations( + self, relations: List[TaxonomicRelation], label: bool = True + ) -> pd.DataFrame: + """Convert a list of `TaxonomicRelation` to a DataFrame. + + Args: + relations: Iterable of `TaxonomicRelation(parent, child)`. + label: Class label to assign to all resulting rows. + + Returns: + DataFrame with columns ['parent', 'child', 'label']. + """ if not relations: return pd.DataFrame(columns=["parent", "child", "label"]) - return pd.DataFrame([{"parent": r.parent, "child": r.child, "label": label} for r in relations]) + return pd.DataFrame( + [{"parent": r.parent, "child": r.child, "label": label} for r in relations] + ) + + def _relations_from_df(self, df: pd.DataFrame) -> List[TaxonomicRelation]: + """Convert a DataFrame to a list of `TaxonomicRelation`. - def _relations_from_df(df: pd.DataFrame) -> List[TaxonomicRelation]: - return [TaxonomicRelation(parent=p, child=c) for p, c in zip(df["parent"], df["child"])] + Args: + df: DataFrame with columns ['parent', 'child']. 
- def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]") -> str: + Returns: + List of `TaxonomicRelation` objects in row order. + """ + return [ + TaxonomicRelation(parent=p, child=c) + for p, c in zip(df["parent"], df["child"]) + ] + + def _build_masked_prompt( + self, parent: str, child: str, index_1_based: int, mask_token: str = "[MASK]" + ) -> str: + """Construct one of several True/False prompts with a mask token. + + Args: + parent: Parent label. + child: Child label. + index_1_based: 1-based index selecting a template. + mask_token: The token used to denote the masked label. + + Returns: + A formatted prompt string. + """ prompts_1based = [ f"{parent} is the superclass of {child}. This statement is {mask_token}.", f"{child} is a subclass of {parent}. This statement is {mask_token}.", @@ -226,18 +422,42 @@ def _build_masked_prompt(self, parent: str, child: str, index_1_based: int, mask f"{child} is a subtype of {parent}. This statement is {mask_token}.", f"{parent} is an ancestor class of {child}. This statement is {mask_token}.", f"{child} is a descendant classs of {child}. This statement is {mask_token}.", - f"\"{parent}\" is the superclass of \"{child}\". This statement is {mask_token}.", + f'"{parent}" is the superclass of "{child}". This statement is {mask_token}.', ] return prompts_1based[index_1_based - 1] @torch.no_grad() def _predict_prompt_true_false(self, sentence: str) -> bool: + """Run a single True/False prediction on a prompt. + + Args: + sentence: Fully formatted prompt text. + + Returns: + True iff the predicted class index is 1 (positive). + """ enc = self.tokenizer(sentence, return_tensors="pt").to(self.model.device) logits = self.model(**enc).logits predicted_label = torch.argmax(logits, dim=1).item() return predicted_label == 1 def _select_parent_via_prompts(self, child: str) -> str: + """Select the most likely parent for a given child via prompt voting. + + The procedure: + 1) Generate prompts for each candidate parent at increasing "levels". + 2) Accumulate votes from the True/False classifier. + 3) Resolve ties by recursing to the next level; after 4 levels, break ties randomly. + + Args: + child: The child label whose parent should be predicted. + + Returns: + The chosen parent string. + + Raises: + AssertionError: If candidate parents were not initialized. + """ assert self._candidate_parents, "Candidate parents not initialized." scores: dict[str, int] = {p: 0 for p in self._candidate_parents} @@ -247,14 +467,18 @@ def prompt_indices_for_level(level: int) -> List[int]: return [2 * level, 2 * level + 1] def recurse(active_parents: List[str], level: int) -> str: - idxs = [i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts] + idxs = [ + i for i in prompt_indices_for_level(level) if 1 <= i <= self.n_prompts + ] if idxs: for parent in active_parents: votes = sum( 1 for idx in idxs if self._predict_prompt_true_false( - self._build_masked_prompt(parent=parent, child=child, index_1_based=idx) + self._build_masked_prompt( + parent=parent, child=child, index_1_based=idx + ) ) ) scores[parent] += votes @@ -277,6 +501,15 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): TEST: - OntologyData -> parent selection: [{'parent': predicted, 'child': child}] - DataFrame/list -> binary pair classification with 'label' + 'score' + + Args: + data: One of {OntologyData, pandas.DataFrame, list[dict], list[tuple]}. + test: If True, run inference; otherwise perform training. 
+ + Returns: + - On training: None (model is fine-tuned in-place). + - On inference with OntologyData: list of {'parent','child'} predictions. + - On inference with pairs: list of dicts including 'label' and 'score'. """ is_ontology_object = isinstance(data, OntologyData) @@ -298,7 +531,9 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): if self._candidate_parents is None: self._candidate_parents = parents_in_call else: - self._candidate_parents = sorted(set(self._candidate_parents).union(parents_in_call)) + self._candidate_parents = sorted( + set(self._candidate_parents).union(parents_in_call) + ) else: if self._candidate_parents is None: self._candidate_parents = parents_in_call @@ -317,7 +552,7 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): true_probs_by_prompt: List[torch.Tensor] = [] for i in range(self.n_prompts): - col = f"prompt_{i+1}" + col = f"prompt_{i + 1}" enc = self.tokenizer( prompts_df[col].tolist(), return_tensors="pt", @@ -333,18 +568,35 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): results: List[dict[str, Any]] = [] for p, c, s, yhat in zip( - pairs_df["parent"], pairs_df["child"], avg_true_prob.tolist(), predicted_bool + pairs_df["parent"], + pairs_df["child"], + avg_true_prob.tolist(), + predicted_bool, ): - results.append({"parent": p, "child": c, "label": int(bool(yhat)), "score": float(s)}) + results.append( + { + "parent": p, + "child": c, + "label": int(bool(yhat)), + "score": float(s), + } + ) return results if isinstance(data, OntologyData): train_onto, eval_onto = ontology_split( - data, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + data, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) - train_pos_rel: List[TaxonomicRelation] = getattr(train_onto.type_taxonomies, "taxonomies", []) or [] - eval_pos_rel: List[TaxonomicRelation] = getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + train_pos_rel: List[TaxonomicRelation] = ( + getattr(train_onto.type_taxonomies, "taxonomies", []) or [] + ) + eval_pos_rel: List[TaxonomicRelation] = ( + getattr(eval_onto.type_taxonomies, "taxonomies", []) or [] + ) train_pos_df = self._df_from_relations(train_pos_rel, label=True) eval_pos_df = self._df_from_relations(eval_pos_rel, label=True) @@ -360,11 +612,17 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): else: if "label" not in pairs_df.columns or pairs_df["label"].nunique() == 1: - positives_df = pairs_df[pairs_df.get("label", True)][["parent", "child"]].copy() + positives_df = pairs_df[pairs_df.get("label", True)][ + ["parent", "child"] + ].copy() pos_rel = self._relations_from_df(positives_df) tr_rel, ev_rel = taxonomy_split( - pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) train_pos_df = self._df_from_relations(tr_rel, label=True) eval_pos_df = self._df_from_relations(ev_rel, label=True) @@ -372,8 +630,12 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): tr_rev_df, tr_man_df = self._make_negatives(train_pos_df) ev_rev_df, ev_man_df = self._make_negatives(eval_pos_df) - train_df = self._balance_with_negatives(train_pos_df, tr_rev_df, tr_man_df) - eval_df = self._balance_with_negatives(eval_pos_df, ev_rev_df, ev_man_df) + train_df = self._balance_with_negatives( + train_pos_df, tr_rev_df, tr_man_df + ) + eval_df = self._balance_with_negatives( + 
eval_pos_df, ev_rev_df, ev_man_df + ) train_df = self._add_prompt_columns(train_df) eval_df = self._add_prompt_columns(eval_df) @@ -383,16 +645,30 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): pos_rel = self._relations_from_df(positives_df) tr_rel, ev_rel = taxonomy_split( - pos_rel, train_terms=None, test_size=self._EVAL_FRACTION, random_state=self.random_state, verbose=False + pos_rel, + train_terms=None, + test_size=self._eval_fraction, + random_state=self.random_state, + verbose=False, ) train_pos_df = self._df_from_relations(tr_rel, label=True) eval_pos_df = self._df_from_relations(ev_rel, label=True) negatives_df = pairs_df[pairs_df["label"]][["parent", "child"]].copy() - negatives_df = negatives_df.sample(frac=1.0, random_state=self.random_state).reset_index(drop=True) - - n_eval_neg = max(1, int(len(negatives_df) * self._EVAL_FRACTION)) if len(negatives_df) > 0 else 0 - eval_neg_df = negatives_df.iloc[:n_eval_neg].copy() if n_eval_neg > 0 else negatives_df.iloc[:0].copy() + negatives_df = negatives_df.sample( + frac=1.0, random_state=self.random_state + ).reset_index(drop=True) + + n_eval_neg = ( + max(1, int(len(negatives_df) * self._eval_fraction)) + if len(negatives_df) > 0 + else 0 + ) + eval_neg_df = ( + negatives_df.iloc[:n_eval_neg].copy() + if n_eval_neg > 0 + else negatives_df.iloc[:0].copy() + ) train_neg_df = negatives_df.iloc[n_eval_neg:].copy() train_neg_df["label"] = False @@ -410,21 +686,36 @@ def _taxonomy_discovery(self, data: Any, test: bool = False): # Sequential fine-tuning across prompts for i in range(self.n_prompts): - prompt_col = f"prompt_{i+1}" - train_ds = Dataset.from_pandas(train_df[[prompt_col, "label"]].reset_index(drop=True)) - eval_ds = Dataset.from_pandas(eval_df[[prompt_col, "label"]].reset_index(drop=True)) + prompt_col = f"prompt_{i + 1}" + train_ds = Dataset.from_pandas( + train_df[[prompt_col, "label"]].reset_index(drop=True) + ) + eval_ds = Dataset.from_pandas( + eval_df[[prompt_col, "label"]].reset_index(drop=True) + ) train_ds = train_ds.rename_column("label", "labels") eval_ds = eval_ds.rename_column("label", "labels") def tokenize_batch(batch): - return self.tokenizer(batch[prompt_col], padding="max_length", truncation=True) + """Tokenize a batch for the current prompt column with truncation/padding.""" + return self.tokenizer( + batch[prompt_col], padding="max_length", truncation=True + ) - train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) - eval_ds = eval_ds.map(tokenize_batch, batched=True, remove_columns=[prompt_col]) + train_ds = train_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) + eval_ds = eval_ds.map( + tokenize_batch, batched=True, remove_columns=[prompt_col] + ) - train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) - eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"]) + train_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) + eval_ds.set_format( + type="torch", columns=["input_ids", "attention_mask", "labels"] + ) trainer = Trainer( model=self.model, @@ -481,13 +772,25 @@ class SKHNLPZSLearner(AutoLearner): def __init__( self, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct", - device: Optional[str] = None, # "cuda" | "cpu" | None (auto) + device: Optional[str] = None, # "cuda" | "cpu" | None (auto) max_new_tokens: int = 16, - save_path: Optional[str] = None, # directory or full path + save_path: Optional[str] = None, # directory or full path verbose: 
bool = True, - normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" + normalize_mode: str = "none", # "none" | "substring" | "levenshtein" | "auto" random_state: int = 1403, ) -> None: + """Configure the zero-shot learner. + + Args: + model_name: HF model id/path for the instruction-tuned causal LLM. + device: Force device ('cuda' or 'cpu'), else auto-detect. + max_new_tokens: Generation length budget for each completion. + save_path: Optional CSV path or directory for saving predictions. + verbose: If True, print progress messages. + normalize_mode: Post-processing for class names + ('none' | 'substring' | 'levenshtein' | 'auto'). + random_state: RNG seed for any sampling steps. + """ super().__init__() self.model_name = model_name self.verbose = verbose @@ -502,7 +805,7 @@ def __init__( if device is None: self._has_cuda = torch.cuda.is_available() else: - self._has_cuda = (device == "cuda") + self._has_cuda = device == "cuda" self._pipe_device = 0 if self._has_cuda else -1 self._model_device_map = {"": "cuda"} if self._has_cuda else None @@ -530,6 +833,13 @@ def __init__( def load(self, model_id: str = "") -> None: """ Load tokenizer, model, and text-generation pipeline. + + Args: + model_id: Optional HF id/path override; defaults to `self.model_name`. + + Side Effects: + Initializes the tokenizer and model, configures the generation + pipeline on CPU/GPU, and sets a pad token if absent. """ model_id = model_id or self.model_name if self.verbose: @@ -538,7 +848,10 @@ def load(self, model_id: str = "") -> None: self._tokenizer = AutoTokenizer.from_pretrained(model_id) # Ensure a pad token is set for generation - if self._tokenizer.pad_token_id is None and self._tokenizer.eos_token_id is not None: + if ( + self._tokenizer.pad_token_id is None + and self._tokenizer.eos_token_id is not None + ): self._tokenizer.pad_token = self._tokenizer.eos_token self._model = AutoModelForCausalLM.from_pretrained( @@ -558,10 +871,19 @@ def load(self, model_id: str = "") -> None: print("Device set to use", "cuda" if self._has_cuda else "cpu") print("[ZeroShotTaxonomyLearner] Model loaded.") - def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Dict[str, str]]]: + def _taxonomy_discovery( + self, data: Any, test: bool = False + ) -> Optional[List[Dict[str, str]]]: """ Zero-shot prediction over all incoming rows (no filtering/augmentation). - Returns a list of dictionaries: [{'parent': predicted_label, 'child': child}, ...] + + Args: + data: One of {DataFrame, list[dict], list[tuple], Ontology-like}. + test: If False, training is skipped (zero-shot learner), and None is returned. + + Returns: + On `test=True`, a list of dicts [{'parent': predicted_label, 'child': child}, ...]. + On `test=False`, returns None. """ if not test: if self.verbose: @@ -617,16 +939,22 @@ def _taxonomy_discovery(self, data: Any, test: bool = False) -> Optional[List[Di eval_df.at[idx, "prediction"] = final_label # Return in the format expected by the pipeline - return [{"parent": p, "child": c} for p, c in zip(eval_df["prediction"], eval_df["child"])] + return [ + {"parent": p, "child": c} + for p, c in zip(eval_df["prediction"], eval_df["child"]) + ] def _generate_and_parse(self, child_term: str) -> (str, str): """ Generate a completion for the given child term and extract the raw predicted class using the strict '#[ ... ]#' pattern. 
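A standalone sketch of the '#[ ... ]#' extraction mentioned above; the regex here is one plausible way to read that delimiter pair and is not copied from the patch.

import re

def parse_prediction(generation: str) -> str:
    match = re.search(r"#\[(.*?)\]#", generation, re.S)
    return match.group(1).strip() if match else "unknown"

print(parse_prediction("The class is #[ music ]#."))     # -> 'music'
print(parse_prediction("no delimiters in this output"))  # -> 'unknown'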
- Returns - ------- - (raw_generation_text, parsed_prediction_or_unknown) + Args: + child_term: The child label to classify into one of the fixed classes. + + Returns: + Tuple `(raw_generation_text, parsed_prediction_or_unknown)`, where the second + element is either the text inside '#[ ... ]#' or the string 'unknown'. """ messages = [ {"role": "system", "content": "You are a helpful classifier."}, @@ -654,13 +982,15 @@ def _generate_and_parse(self, child_term: str) -> (str, str): parsed = match.group(1).strip() if match else "unknown" return generation, parsed - # ------------------------------------------------------------------------- - # Normalization helpers - # ------------------------------------------------------------------------- - def _normalize_substring_only(self, text: str) -> str: """ Snap to a label if the string is equal to / contained in / contains a valid label (case-insensitive). + + Args: + text: Raw class text to normalize. + + Returns: + One of `CLASS_LIST` on a match; otherwise 'unknown'. """ if not isinstance(text, str): return "unknown" @@ -670,13 +1000,23 @@ def _normalize_substring_only(self, text: str) -> str: for label in self.CLASS_LIST: label_lower = label.lower() - if lowered == label_lower or lowered in label_lower or label_lower in lowered: + if ( + lowered == label_lower + or lowered in label_lower + or label_lower in lowered + ): return label return "unknown" def _normalize_levenshtein_only(self, text: str) -> str: """ Snap to the nearest label by Levenshtein (edit) distance. + + Args: + text: Raw class text to normalize. + + Returns: + The nearest label in `CLASS_LIST`, or 'unknown' if input is empty/invalid. """ if not isinstance(text, str): return "unknown" @@ -697,37 +1037,59 @@ def _normalize_levenshtein_only(self, text: str) -> str: def _normalize_auto(self, text: str) -> str: """ Cascade: try substring-first; if no match, fall back to Levenshtein snapping. + + Args: + text: Raw class text to normalize. + + Returns: + Normalized label string or 'unknown'. """ snapped = self._normalize_substring_only(text) - return snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + return ( + snapped if snapped != "unknown" else self._normalize_levenshtein_only(text) + ) - def _to_dataframe(data: Any) -> pd.DataFrame: + def _to_dataframe(self, data: Any) -> pd.DataFrame: """ - Normalize various input formats into a DataFrame with columns: - ['child', 'parent'] or ['child', 'parent', 'label']. + Normalize various input formats into a DataFrame. + + Supported inputs: + * pandas.DataFrame with columns ['child','parent',('label')] + * list[dict] with keys 'child','parent',('label') + * list of tuples/lists: (child, parent) or (child, parent, label) + * Ontology-like object with `.type_taxonomies.taxonomies` + + Args: + data: The source object to normalize. + + Returns: + A pandas DataFrame with standardized columns. + + Raises: + ValueError: If the input type/shape is not recognized. 
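A rough sketch of the substring-then-Levenshtein snapping cascade described above; the label inventory and the tiny edit-distance helper are illustrative stand-ins, not the learner's CLASS_LIST.

CLASS_LIST = ["major", "minor", "diminished"]   # stand-in label inventory

def edit_distance(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance over two strings.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]

def normalize(text: str) -> str:
    lowered = text.strip().lower()
    if not lowered:
        return "unknown"
    for label in CLASS_LIST:                    # substring pass first
        low = label.lower()
        if lowered == low or lowered in low or low in lowered:
            return label
    # Fallback: snap to the nearest label by edit distance.
    return min(CLASS_LIST, key=lambda lbl: edit_distance(lowered, lbl.lower()))

print(normalize("Minor chord"))  # substring match -> 'minor'
print(normalize("majr"))         # edit-distance fallback -> 'major'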
""" - # Already a DataFrame if isinstance(data, pd.DataFrame): df = data.copy() df.columns = [str(c).lower() for c in df.columns] return df.reset_index(drop=True) - # List[dict] if isinstance(data, list) and data and isinstance(data[0], dict): rows = [{str(k).lower(): v for k, v in d.items()} for d in data] return pd.DataFrame(rows).reset_index(drop=True) - # Iterable of tuples/lists: (child, parent[, label]) if isinstance(data, (list, tuple)) and data: first = data[0] if isinstance(first, (list, tuple)) and not isinstance(first, dict): n = len(first) if n >= 3: - return pd.DataFrame(data, columns=["child", "parent", "label"]).reset_index(drop=True) + return pd.DataFrame( + data, columns=["child", "parent", "label"] + ).reset_index(drop=True) if n == 2: - return pd.DataFrame(data, columns=["child", "parent"]).reset_index(drop=True) + return pd.DataFrame(data, columns=["child", "parent"]).reset_index( + drop=True + ) - # OntoLearner-style object (with .type_taxonomies.taxonomies) try: type_taxonomies = getattr(data, "type_taxonomies", None) if type_taxonomies is not None: @@ -737,9 +1099,15 @@ def _to_dataframe(data: Any) -> pd.DataFrame: for rel in taxonomies: parent = getattr(rel, "parent", None) child = getattr(rel, "child", None) - label = getattr(rel, "label", None) if hasattr(rel, "label") else None + label = ( + getattr(rel, "label", None) + if hasattr(rel, "label") + else None + ) if parent is not None and child is not None: - rows.append({"child": child, "parent": parent, "label": label}) + rows.append( + {"child": child, "parent": parent, "label": label} + ) if rows: return pd.DataFrame(rows).reset_index(drop=True) except Exception: @@ -751,10 +1119,19 @@ def _to_dataframe(data: Any) -> pd.DataFrame: ".type_taxonomies.taxonomies." ) - def _resolve_save_path(save_path: str, default_filename: str) -> str: + def _resolve_save_path(self, save_path: str, default_filename: str) -> str: """ - If `save_path` is a directory, join it with `default_filename`. - If it's a file path, return as-is. + Resolve a target file path from a directory or path-like input. + + If `save_path` points to a directory, joins it with `default_filename`. + If it already looks like a file path (has an extension), returns as-is. + + Args: + save_path: Directory or file path supplied by the caller. + default_filename: Basename to use when `save_path` is a directory. + + Returns: + A concrete file path where outputs can be written. """ base = os.path.basename(save_path) has_ext = os.path.splitext(base)[1] != "" diff --git a/ontolearner/learner/term_typing/__init__.py b/ontolearner/learner/term_typing/__init__.py deleted file mode 100644 index a42d716..0000000 --- a/ontolearner/learner/term_typing/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .rwthdbis import RWTHDBISSFTLearner -from .sbunlp import SBUNLPZSLearner -from .alexbek import AlexbekRFLearner, AlexbekRAGLearner diff --git a/ontolearner/learner/term_typing/alexbek.py b/ontolearner/learner/term_typing/alexbek.py index 7aa6033..0db694b 100644 --- a/ontolearner/learner/term_typing/alexbek.py +++ b/ontolearner/learner/term_typing/alexbek.py @@ -12,6 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Learners for supervised and retrieval-augmented *term typing*. + +This module implements two learners: + +- **AlexbekRFLearner** (retriever/classifier): + Encodes terms with a Hugging Face encoder, optionally augments with simple + graph features, and trains a One-vs-Rest RandomForest for multi-label typing. + +- **AlexbekRAGLearner** (retrieval-augmented generation): + Builds an in-memory example index with sentence embeddings, retrieves + nearest examples for each query term, then prompts an instruction-tuned + causal LLM to produce types, parsing the JSON response. + +Both learners conform to the `AutoLearner` / `AutoRetriever` APIs used in +the outer pipeline. +""" + import gc import json import re @@ -31,22 +48,19 @@ from ...base import AutoLearner, AutoRetriever + class AlexbekRFLearner(AutoRetriever): """ Embedding-based multi-label classifier for *term typing*. - Pipeline overview: - 1) Load a Hugging Face encoder (tokenizer + model). - 2) Encode input terms into sentence embeddings. - 3) Optionally augment with simple graph (co-occurrence) features. - 4) Train a One-vs-Rest RandomForest on the concatenated features. - 5) Predict multi-label types with a probability threshold (fallback to top-1). - - API expected by LearnerPipeline: - - load(model_id) - - fit(data, task, ontologizer=True) - - predict(data, task, ontologizer=True) - - tasks_ground_truth_former(data, task) + Pipeline + 1) Load a Hugging Face encoder (tokenizer + model). + 2) Encode input terms into sentence embeddings. + 3) Optionally augment with simple graph (co-occurrence) features. + 4) Train a One-vs-Rest RandomForest on the concatenated features. + 5) Predict multi-label types with a probability threshold (fallback to top-1). + + Implements the `AutoRetriever` interface used by the outer pipeline. """ def __init__( @@ -58,6 +72,23 @@ def __init__( use_graph_features: bool = True, rf_kwargs: Optional[Dict[str, Any]] = None, ): + """Configure the RF-based multi-label learner. + + Parameters + device: + Torch device spec ('cpu' or 'cuda'). + batch_size: + Encoding mini-batch size for the transformer. + max_length: + Maximum input token length for the encoder tokenizer. + threshold: + Per-label probability threshold at prediction time. + use_graph_features: + If True, add simple graph features to embeddings. + rf_kwargs: + Optional RandomForest hyperparameters dictionary. + + """ # Runtime / inference settings self.device = torch.device(device) self.batch_size = batch_size @@ -81,21 +112,50 @@ def __init__( self.term_graph: Optional[nx.Graph] = None def load(self, model_id: str, **_: Any) -> None: - """Load a Hugging Face encoder by model id (tokenizer + base model).""" + """Load a Hugging Face encoder by model id (tokenizer + base model). + + Parameters + model_id: + HF model identifier or local path for an encoder backbone. + + Side Effects + - Sets `self.model_name`, `self.tokenizer`, `self.embedding_model`. + - Puts the model in eval mode and moves it to `self.device`. 
+ """ self.model_name = model_id self.tokenizer = AutoTokenizer.from_pretrained(model_id) self.embedding_model = AutoModel.from_pretrained(model_id) self.embedding_model.eval().to(self.device) def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: - """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features).""" + """Train the One-vs-Rest RandomForest on term embeddings (+ optional graph features). + + Parameters + data: + Training payload; supported formats are routed via `_as_term_types_dicts`. + Each example must contain at least `{"term": str, "types": List[str]}`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. + **_: + Ignored extra arguments. + + Raises + ValueError + If `task` is not `'term-typing'` or if no valid examples are found. + """ if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) # Normalize incoming training data into a list of dicts: {term, types, RAG} training_rows = self._as_term_types_dicts(data) if not training_rows: - raise ValueError("No valid training examples found (need 'term' and 'types').") + raise ValueError( + "No valid training examples found (need 'term' and 'types')." + ) # Split out terms and raw labels training_terms: List[str] = [row["term"] for row in training_rows] @@ -110,7 +170,9 @@ def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: # Optionally build a light-weight co-occurrence graph and extract features if self.use_graph_features: self.term_graph = self._create_term_graph(training_rows) - graph_features_train = self._extract_graph_features(self.term_graph, training_terms) + graph_features_train = self._extract_graph_features( + self.term_graph, training_terms + ) X_train = np.hstack([term_embeddings_train, graph_features_train]) else: self.term_graph = None @@ -120,18 +182,48 @@ def fit(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> None: Y_train = self.label_binarizer.transform(raw_label_lists) # One-vs-Rest RandomForest (one binary RF per label) - self.ovr_random_forest = OneVsRestClassifier(RandomForestClassifier(**self.rf_kwargs)) + self.ovr_random_forest = OneVsRestClassifier( + RandomForestClassifier(**self.rf_kwargs) + ) self.ovr_random_forest.fit(X_train, Y_train) - - def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> List[Dict[str, Any]]: + def predict( + self, data: Any, task: str, ontologizer: bool = True, **_: Any + ) -> List[Dict[str, Any]]: """Predict multi-label types for input terms. - Returns a list of dicts with keys: {id, term, types}. + Parameters + data: + Evaluation payload; formats normalized by `_as_predict_terms_ids`. + task: + Must be `'term-typing'`. + ontologizer: + Unused here; accepted for API compatibility. + **_: + Ignored extra arguments. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys: + - `id`: Original example id (if provided). + - `term`: Input term string. + - `types`: List of predicted label strings (selected by threshold or top-1). + + Raises + ValueError + If `task` is not `'term-typing'`. + RuntimeError + If `load()` and `fit()` have not been called. 
""" if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") - if self.ovr_random_forest is None or self.tokenizer is None or self.embedding_model is None: + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) + if ( + self.ovr_random_forest is None + or self.tokenizer is None + or self.embedding_model is None + ): raise RuntimeError("Call load() and fit() before predict().") # Normalize prediction input into parallel lists of terms and example ids @@ -142,7 +234,9 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L # Match feature layout used during training if self.use_graph_features and self.term_graph is not None: - graph_features_test = self._extract_graph_features(self.term_graph, test_terms) + graph_features_test = self._extract_graph_features( + self.term_graph, test_terms + ) X_test = np.hstack([term_embeddings_test, graph_features_test]) else: X_test = term_embeddings_test @@ -160,7 +254,9 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L if len(selected_label_indices) == 0: selected_label_indices = [int(np.argmax(label_probabilities))] - predicted_types = [label_names[label_idx] for label_idx in selected_label_indices] + predicted_types = [ + label_names[label_idx] for label_idx in selected_label_indices + ] predictions.append( { @@ -172,20 +268,49 @@ def predict(self, data: Any, task: str, ontologizer: bool = True, **_: Any) -> L return predictions def tasks_ground_truth_former(self, data: Any, task: str) -> List[Dict[str, Any]]: - """Normalize ground-truth into a list of {id, term, types} dicts for evaluation.""" + """Normalize ground-truth into a list of {id, term, types} dicts for evaluation. + + Parameters + data: + Ground-truth payload; supported formats include objects exposing + `.term_typings`, a list of dicts, or a list of tuples/lists. + task: + Must be `'term-typing'`. + + Returns + List[Dict[str, Any]] + A list of dictionaries with keys `id`, `term`, `types` (list of str). + + Raises + ValueError + If `task` is not `'term-typing'`. + """ if task != "term-typing": - raise ValueError("OntologyTypeRFClassifier supports only task='term-typing'.") + raise ValueError( + "OntologyTypeRFClassifier supports only task='term-typing'." + ) return self._as_gold_id_term_types(data) def _encode(self, texts: List[str]) -> np.ndarray: - """Encode a list of strings into L2-normalized sentence embeddings (NumPy array). + """Encode a list of strings into L2-normalized sentence embeddings. - If no texts are provided, returns an empty array with width equal to the model hidden size. + Parameters + texts: + List of input texts/terms. + + Returns + np.ndarray + Array of shape `(len(texts), hidden_size)` with L2-normalized + embeddings. If `texts` is empty, returns a `(0, hidden_size)` array. """ - assert self.tokenizer is not None and self.embedding_model is not None, "Call load(model_id) first." + assert self.tokenizer is not None and self.embedding_model is not None, ( + "Call load(model_id) first." 
+ ) if not texts: - hidden_size = getattr(getattr(self.embedding_model, "config", None), "hidden_size", 768) + hidden_size = getattr( + getattr(self.embedding_model, "config", None), "hidden_size", 768 + ) return np.zeros((0, hidden_size), dtype=np.float32) batch_embeddings: List[torch.Tensor] = [] @@ -208,11 +333,15 @@ def _encode(self, texts: List[str]) -> np.ndarray: model_output = self.embedding_model(**tokenized_batch) # Prefer dedicated pooler if provided; otherwise pool by last valid token - if hasattr(model_output, "pooler_output") and model_output.pooler_output is not None: + if ( + hasattr(model_output, "pooler_output") + and model_output.pooler_output is not None + ): sentence_embeddings = model_output.pooler_output else: sentence_embeddings = self._last_token_pool( - model_output.last_hidden_state, tokenized_batch["attention_mask"] + model_output.last_hidden_state, + tokenized_batch["attention_mask"], ) # L2-normalize embeddings for stability @@ -230,18 +359,44 @@ def _encode(self, texts: List[str]) -> np.ndarray: # Concatenate all batches and convert to NumPy return torch.cat(batch_embeddings, dim=0).numpy() - def _last_token_pool(self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - """Select the last *non-padding* token embedding for each sequence in the batch.""" + def _last_token_pool( + self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Select the last *non-padding* token embedding for each sequence. + + Parameters + last_hidden_states: + Tensor of shape `(batch, seq_len, hidden)`. + attention_mask: + Tensor of shape `(batch, seq_len)` with 1 for real tokens. + + Returns + torch.Tensor + Tensor of shape `(batch, hidden)` with per-sequence pooled embeddings. + """ last_valid_token_idx = attention_mask.sum(dim=1) - 1 # (batch,) - batch_row_idx = torch.arange(last_hidden_states.size(0), device=last_hidden_states.device) + batch_row_idx = torch.arange( + last_hidden_states.size(0), device=last_hidden_states.device + ) return last_hidden_states[batch_row_idx, last_valid_token_idx] def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: """Create a simple undirected co-occurrence graph from training rows. - Nodes: terms (with node attribute 'types'). - Edges: between a term and each neighbor from its optional RAG list. - Edge weight = number of shared types (or 0.1 if none shared). + Graph Structure + Nodes + Terms (node attribute `'types'` is stored per term). + Edges + Between a term and each neighbor from its optional RAG list. + Edge weight = number of shared types (or 0.1 if none shared). + + Parameters + training_rows: + Normalized rows with keys: `'term'`, `'types'`, optional `'RAG'`. + + Returns + networkx.Graph + The constructed undirected graph. 
""" graph = nx.Graph() @@ -251,7 +406,7 @@ def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: graph.add_node(term, types=term_types) # RAG may be a list of neighbor dicts like {"term": ..., "types": [...]} - for neighbor in (row.get("RAG", []) or []): + for neighbor in row.get("RAG", []) or []: neighbor_term = neighbor.get("term") neighbor_types = neighbor.get("types", []) @@ -263,12 +418,24 @@ def _create_term_graph(self, training_rows: List[Dict[str, Any]]) -> nx.Graph: return graph - def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np.ndarray: + def _extract_graph_features( + self, term_graph: nx.Graph, terms: List[str] + ) -> np.ndarray: """Compute simple per-term graph features. + Feature Vector For each term we compute a 4-dim vector: - [degree, clustering_coefficient, degree_centrality, pagerank_score] - Returns an array of shape [len(terms), 4]. + `[degree, clustering_coefficient, degree_centrality, pagerank_score]` + + Parameters + term_graph: + Graph built over training terms. + terms: + List of term strings to extract features for. + + Returns + np.ndarray + Array of shape `(len(terms), 4)` (dtype float32). """ if len(term_graph): degree_centrality = nx.degree_centrality(term_graph) @@ -293,7 +460,26 @@ def _extract_graph_features(self, term_graph: nx.Graph, terms: List[str]) -> np. return np.asarray(feature_rows, dtype=np.float32) def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: - """Normalize diverse training data formats to a list of dicts: {term, types, RAG}.""" + """Normalize diverse training data formats to a list of dicts: {term, types, RAG}. + + Supported Inputs + - Object with attribute `.term_typings` (iterable of items exposing + `.term`, `.types`, optional `.RAG`). + - List of dicts with keys `term`, `types`, optional `RAG`. + - List/tuple of `(term, types[, RAG])`. + + Parameters + data: + Training payload. + + Returns + List[Dict[str, Any]] + Normalized dictionaries ready for training. + + Raises + ValueError + If `data` is neither a list/tuple nor exposes `.term_typings`. 
+ """ normalized_rows: List[Dict[str, Any]] = [] # Case 1: object with attribute `.term_typings` @@ -308,13 +494,19 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows # Otherwise: must be a list/tuple-like container if not isinstance(data, (list, tuple)): - raise ValueError("Training data must be a list/tuple or expose .term_typings") + raise ValueError( + "Training data must be a list/tuple or expose .term_typings" + ) if not data: return normalized_rows @@ -330,7 +522,11 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows @@ -345,13 +541,36 @@ def _as_term_types_dicts(self, data: Any) -> List[Dict[str, Any]]: if not isinstance(type_list, list): type_list = [type_list] normalized_rows.append( - {"term": str(term_text), "types": [str(x) for x in type_list], "RAG": rag_neighbors} + { + "term": str(term_text), + "types": [str(x) for x in type_list], + "RAG": rag_neighbors, + } ) return normalized_rows def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: - """Normalize prediction input into parallel lists: (terms, ids).""" + """Normalize prediction input into parallel lists: (terms, ids). + + Supported Inputs + - Object with `.term_typings`. + - List of dicts with `term` and optional `id`. + - List of tuples/lists `(term, id[, ...])`. + - List of plain term strings. + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Any]] + `(terms, example_ids)` lists aligned by index. + + Raises + ValueError + If the input format is unsupported. + """ terms: List[str] = [] example_ids: List[Any] = [] @@ -392,7 +611,20 @@ def _as_predict_terms_ids(self, data: Any) -> Tuple[List[str], List[Any]]: raise ValueError("Unsupported predict() input format.") def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: - """Normalize gold labels into a list of dicts: {id, term, types}.""" + """Normalize gold labels into a list of dicts: {id, term, types}. + + Supported Inputs + Mirrors `_as_term_types_dicts`, but ensures an `id` is set. + + Parameters + data: + Ground-truth payload. + + Returns + List[Dict[str, Any]] + `{'id': Any, 'term': str, 'types': List[str]}` entries. 
+ + """ gold_rows: List[Dict[str, Any]] = [] # Case 1: object with attribute `.term_typings` @@ -404,7 +636,13 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: type_list = getattr(item, "types", []) if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows # Case 2: list/tuple container @@ -419,7 +657,13 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: type_list = row.get("types", []) if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows # 2b) list of tuples/lists: (term, types[, id]) @@ -432,35 +676,68 @@ def _as_gold_id_term_types(self, data: Any) -> List[Dict[str, Any]]: gold_id = tuple_row[2] if len(tuple_row) > 2 else i if not isinstance(type_list, list): type_list = [type_list] - gold_rows.append({"id": gold_id, "term": term_text, "types": [str(t) for t in type_list]}) + gold_rows.append( + { + "id": gold_id, + "term": term_text, + "types": [str(t) for t in type_list], + } + ) return gold_rows - raise ValueError("Unsupported ground-truth input format for tasks_ground_truth_former().") + raise ValueError( + "Unsupported ground-truth input format for tasks_ground_truth_former()." + ) + class AlexbekRAGLearner(AutoLearner): """Retrieval-Augmented Term Typing learner (single task: term-typing). - Flow: - 1) fit: collect (term -> [types]) examples, build an in-memory index - using a sentence-embedding model. - 2) predict: for each new term, retrieve top-k similar examples, compose a - structured prompt, query an instruction-tuned causal LLM, and parse types. + Flow + 1) `fit`: collect (term -> [types]) examples, build an in-memory index + using a sentence-embedding model. + 2) `predict`: for each new term, retrieve top-k similar examples, compose a + structured prompt, query an instruction-tuned causal LLM, and parse types. - Returns a list of dicts: {"term": str, "types": List[str], "id": Optional[str]}. + Returns + List[Dict[str, Any]] + `{"term": str, "types": List[str], "id": Optional[str]}` rows. """ def __init__( self, llm_model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", retriever_model_id: str = "sentence-transformers/all-MiniLM-L6-v2", - device: str = "auto", # "auto" | "cuda" | "cpu" - token: str = "", # HF token if needed + device: str = "auto", # "auto" | "cuda" | "cpu" + token: str = "", # HF token if needed top_k: int = 3, max_new_tokens: int = 256, - gen_batch_size: int = 4, # generation batch size + gen_batch_size: int = 4, # generation batch size enc_batch_size: int = 64, # embedding batch size - **kwargs: Any, # absorb extra pipeline-style args + **kwargs: Any, # absorb extra pipeline-style args ) -> None: + """Configure the RAG learner. + + Parameters + llm_model_id: + HF model id/path for the instruction-tuned causal LLM. + retriever_model_id: + Sentence-embedding model id for retrieval. + device: + Device policy ('auto'|'cuda'|'cpu') for the LLM. + token: + Optional HF token for gated models. + top_k: + Number of nearest examples to retrieve per query term. + max_new_tokens: + Decoding budget for the LLM. + gen_batch_size: + Number of prompts per generation batch. 
+ enc_batch_size: + Number of texts per embedding batch. + **kwargs: + Extra configuration captured for downstream use. + """ super().__init__() # Consolidated configuration for simple serialization @@ -482,7 +759,7 @@ def __init__( # Retriever components self.embedder: Optional[SentenceTransformer] = None - self.indexed_corpus: List[str] = [] # items: " || [...]" + self.indexed_corpus: List[str] = [] # items: " || [...]" self.corpus_embeddings: Optional[torch.Tensor] = None # Training cache of (term, [types]) tuples @@ -497,15 +774,13 @@ def __init__( "2) Be concise. Respond ONLY in JSON using double quotes.\n" 'Format: {"term":"...", "reasoning":"<<=100 words>>", "types":["...", "..."]}\n' ) - self._user_prompt_template: str = ( - """{examples} + self._user_prompt_template: str = """{examples} TERM: {term} TASK: Determine semantic types for the given term based on the domain ontology. Remember: types are generalizing categories, not the term itself. Respond in JSON. """ - ) def load( self, @@ -515,7 +790,21 @@ def load( token: Optional[str] = None, **kwargs: Any, ) -> None: - """Load the LLM and the embedding retriever. Overrides constructor values if provided.""" + """Load the LLM and the embedding retriever. Overrides constructor values if provided. + + Parameters + model_id: + Optional override for the LLM model id. + retriever_id: + Optional override for the embedding model id. + device: + Optional override for device selection policy. + token: + Optional override for HF token. + **kwargs: + Extra values to store in `extra_cfg`. + + """ if model_id is not None: self.cfg["llm_model_id"] = model_id if retriever_id is not None: @@ -556,10 +845,26 @@ def load( generation_cfg.num_beams = 1 # Retriever - self.embedder = SentenceTransformer(self.cfg["retriever_model_id"], trust_remote_code=True) + self.embedder = SentenceTransformer( + self.cfg["retriever_model_id"], trust_remote_code=True + ) def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: - """Prepare the retrieval index from training examples.""" + """Prepare the retrieval index from training examples. + + Parameters + train_data: + Training payload containing terms and their types. + task: + Must be `'term-typing'`; other tasks are forwarded to base. + ontologizer: + Unused flag for API compatibility. + + Side Effects + - Normalizes to a list of `(term, [types])`. + - Builds an indexable text corpus and (if embedder is loaded) + computes embeddings for retrieval. + """ if task != "term-typing": return super().fit(train_data, task, ontologizer) @@ -568,7 +873,8 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: # Build the textual corpus to index self.indexed_corpus = [ - f"{term} || {json.dumps(types, ensure_ascii=False)}" for term, types in self.train_term_types + f"{term} || {json.dumps(types, ensure_ascii=False)}" + for term, types in self.train_term_types ] # Embed the corpus if available; else fall back to zero-shot prompting @@ -578,7 +884,23 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True) -> None: self.corpus_embeddings = None def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: - """Predict types for evaluation items; returns a list of {term, types, id?}.""" + """Predict types for evaluation items; returns a list of {term, types, id?}. + + Parameters + eval_data: + Evaluation payload to type (terms + optional ids). + task: + Must be `'term-typing'`; other tasks are forwarded to base. 
+ ontologizer: + Unused flag for API compatibility. + + Returns + List[Dict[str, Any]] + For each input term, a dictionary with keys: + - `term`: The input term. + - `types`: A (unique, sorted) list of predicted types. + - `id`: Optional example id (if provided in input). + """ if task != "term-typing": return super().predict(eval_data, task, ontologizer) @@ -588,11 +910,15 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: # Use RAG if we have an indexed corpus & embeddings; otherwise zero-shot rag_available = ( - self.corpus_embeddings is not None and self.embedder is not None and len(self.indexed_corpus) > 0 + self.corpus_embeddings is not None + and self.embedder is not None + and len(self.indexed_corpus) > 0 ) if rag_available: - neighbor_docs_per_query = self._retrieve_batch(eval_terms, top_k=int(self.cfg["top_k"])) + neighbor_docs_per_query = self._retrieve_batch( + eval_terms, top_k=int(self.cfg["top_k"]) + ) else: neighbor_docs_per_query = [[] for _ in eval_terms] @@ -608,7 +934,9 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: # Build standardized results results: List[Dict[str, Any]] = [] - for term, example_id, predicted_types in zip(eval_terms, eval_ids, predicted_types_lists): + for term, example_id, predicted_types in zip( + eval_terms, eval_ids, predicted_types_lists + ): result_row: Dict[str, Any] = { "term": term, "types": sorted({t for t in predicted_types}), # unique + sorted @@ -617,11 +945,28 @@ def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: result_row["id"] = example_id results.append(result_row) - assert all(("term" in row and "types" in row) for row in results), "predict() must return term + types" + assert all(("term" in row and "types" in row) for row in results), ( + "predict() must return term + types" + ) return results def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: - """Extract (term, [types]) tuples from supported training payloads.""" + """Extract `(term, [types])` tuples from supported training payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & `.types`) + - `list[dict]` with keys `'term'` and `'types'` + - `list[str]` → returns empty (nothing to index) + - other formats → empty + + Parameters + data: + Training payload. + + Returns + List[Tuple[str, List[str]]] + (term, types) tuples (types kept as strings). 
+ """ term_typings = getattr(data, "term_typings", None) if term_typings is not None: parsed_pairs: List[Tuple[str, List[str]]] = [] @@ -629,7 +974,9 @@ def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: term = getattr(item, "term", None) types = list(getattr(item, "types", []) or []) if term and types: - parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) return parsed_pairs if isinstance(data, list) and data and isinstance(data[0], dict): @@ -638,17 +985,35 @@ def _unpack_train(self, data: Any) -> List[Tuple[str, List[str]]]: term = row.get("term") types = row.get("types") or [] if term and isinstance(types, list) and types: - parsed_pairs.append((term, [t for t in types if isinstance(t, str)])) + parsed_pairs.append( + (term, [t for t in types if isinstance(t, str)]) + ) return parsed_pairs # If only a list of strings is provided, there's nothing to index for RAG - if isinstance(data, (list, set, tuple)) and all(isinstance(x, str) for x in data): + if isinstance(data, (list, set, tuple)) and all( + isinstance(x, str) for x in data + ): return [] return [] def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: - """Extract (terms, ids) from supported evaluation payloads.""" + """Extract `(terms, ids)` from supported evaluation payloads. + + Supported Inputs + - `data.term_typings` (objects exposing `.term` & optional `.id`) + - `list[str]` + - `list[dict]` with `term` and optional `id` + + Parameters + data: + Evaluation payload. + + Returns + Tuple[List[str], List[Optional[str]]] + Two lists aligned by index: terms and ids (ids may contain `None`). + """ term_typings = getattr(data, "term_typings", None) if term_typings is not None: terms: List[str] = [] @@ -672,24 +1037,50 @@ def _unpack_eval(self, data: Any) -> Tuple[List[str], List[Optional[str]]]: return [], [] def _encode_texts(self, texts: List[str]) -> torch.Tensor: - """Encode a batch of texts with the sentence-embedding model.""" + """Encode a batch of texts with the sentence-embedding model. + + Parameters + texts: + List of strings to embed. + + Returns + torch.Tensor + Tensor of shape `(len(texts), hidden_dim)`. If `texts` is empty, + returns an empty tensor with 0 rows. + """ batch_size = int(self.cfg["enc_batch_size"]) batch_embeddings: List[torch.Tensor] = [] for batch_start in range(0, len(texts), batch_size): batch_texts = texts[batch_start : batch_start + batch_size] - embeddings = self.embedder.encode(batch_texts, convert_to_tensor=True, show_progress_bar=False) + embeddings = self.embedder.encode( + batch_texts, convert_to_tensor=True, show_progress_bar=False + ) batch_embeddings.append(embeddings) - return torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + return ( + torch.cat(batch_embeddings, dim=0) if batch_embeddings else torch.empty(0) + ) def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: - """Return for each query the top-k most similar corpus entries (as raw text rows).""" + """Return for each query the top-k most similar corpus entries. + + Parameters + queries: + List of query terms. + top_k: + Number of neighbors to retrieve for each query. + + Returns + List[List[str]] + For each query, a list of raw corpus strings formatted as + `" || [\\"type1\\", ...]"`. 
+ """ if self.corpus_embeddings is None or not self.indexed_corpus: return [[] for _ in queries] - query_embeddings = self._encode_texts(queries) # [Q, D] - doc_embeddings = self.corpus_embeddings # [N, D] + query_embeddings = self._encode_texts(queries) # [Q, D] + doc_embeddings = self.corpus_embeddings # [N, D] if query_embeddings.shape[-1] != doc_embeddings.shape[-1]: raise ValueError( f"Embedding dim mismatch: {query_embeddings.shape[-1]} vs {doc_embeddings.shape[-1]}" @@ -705,7 +1096,16 @@ def _retrieve_batch(self, queries: List[str], top_k: int) -> List[List[str]]: return [[self.indexed_corpus[j] for j in row.tolist()] for row in top_indices] def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: - """Parse raw corpus rows ('term || [types]') into (term, [types]) pairs.""" + """Parse raw corpus rows ('term || [types]') into `(term, [types])` pairs. + + Parameters + docs: + Raw strings from the index/corpus. + + Returns + List[Tuple[str, List[str]]] + Parsed (term, types) pairs; malformed rows are skipped. + """ example_pairs: List[Tuple[str, List[str]]] = [] for raw_row in docs: try: @@ -713,13 +1113,24 @@ def _decode_examples(self, docs: List[str]) -> List[Tuple[str, List[str]]]: term = term_raw.strip() types_list = json.loads(types_json.strip()) if isinstance(types_list, list): - example_pairs.append((term, [t for t in types_list if isinstance(t, str)])) + example_pairs.append( + (term, [t for t in types_list if isinstance(t, str)]) + ) except Exception: continue return example_pairs def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: - """Format retrieved example pairs into a compact block for the prompt.""" + """Format retrieved example pairs into a compact block for the prompt. + + Parameters + pairs: + Retrieved `(term, [types])` examples. + + Returns + str + Human-readable lines to provide *light* guidance to the LLM. + """ if not pairs: return "EXAMPLES: (none provided)" lines: List[str] = ["CLASSIFICATION EXAMPLES:"] @@ -730,12 +1141,34 @@ def _format_examples(self, pairs: List[Tuple[str, List[str]]]) -> str: return "\n".join(lines) def _compose_prompt(self, examples_block: str, term: str) -> str: - """Compose the final prompt from system + user blocks.""" - user_block = self._user_prompt_template.format(examples=examples_block, term=term) + """Compose the final prompt from system + user blocks. + + Parameters + examples_block: + Text block with retrieved examples. + term: + The query term to classify. + + Returns + str + Full prompt string passed to the LLM. + """ + user_block = self._user_prompt_template.format( + examples=examples_block, term=term + ) return f"{self._system_prompt}\n\n{user_block}\n" def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: - """Run generation for a batch of prompts and parse the JSON 'types' from outputs.""" + """Run generation for a batch of prompts and parse the JSON `'types'` from outputs. + + Parameters + prompts: + Finalized prompts for the LLM. + + Returns + List[List[str]] + For each prompt, a list of predicted type strings. 
+ """ batch_size = int(self.cfg["gen_batch_size"]) all_predicted_types: List[List[str]] = [] @@ -744,7 +1177,9 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: # Tokenize and move to the LLM's device model_device = getattr(self.generation_model, "device", None) - encodings = self.tokenizer(prompt_batch, return_tensors="pt", padding=True).to(model_device) + encodings = self.tokenizer( + prompt_batch, return_tensors="pt", padding=True + ).to(model_device) input_token_length = encodings["input_ids"].shape[1] # Deterministic decoding (greedy) @@ -762,9 +1197,14 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: # Slice off the prompt tokens and decode only newly generated tokens new_token_span = generated_tokens[:, input_token_length:] - decoded_texts = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in new_token_span] - - parsed_types_per_prompt = [self._parse_types(text) for text in decoded_texts] + decoded_texts = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in new_token_span + ] + + parsed_types_per_prompt = [ + self._parse_types(text) for text in decoded_texts + ] all_predicted_types.extend(parsed_types_per_prompt) return all_predicted_types @@ -772,11 +1212,19 @@ def _generate_and_parse(self, prompts: List[str]) -> List[List[str]]: def _parse_types(self, text: str) -> List[str]: """Extract a list of type strings from LLM output. - Attempts (in order): - 1) Strict JSON object with "types". - 2) Regex-extract JSON object containing "types". - 3) Regex-extract first bracketed list. - 4) Comma-split fallback. + Parsing Strategy (in order) + 1) Strict JSON object with `"types"`. + 2) Regex-extract JSON object containing `"types"`. + 3) Regex-extract first bracketed list. + 4) Comma-split fallback. + + Parameters + text: + Raw LLM output to parse. + + Returns + List[str] + Parsed list of type strings (possibly empty if parsing fails). 
""" try: obj = json.loads(text) @@ -786,7 +1234,9 @@ def _parse_types(self, text: str) -> List[str]: pass try: - obj_match = re.search(r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S) + obj_match = re.search( + r'\{[^{}]*"types"\s*:\s*\[[^\]]*\][^{}]*\}', text, re.S + ) if obj_match: obj = json.loads(obj_match.group(0)) types = obj.get("types", []) @@ -795,9 +1245,12 @@ def _parse_types(self, text: str) -> List[str]: pass try: - list_match = re.search(r'\[([^\]]+)\]', text) + list_match = re.search(r"\[([^\]]+)\]", text) if list_match: - items = [x.strip().strip('"').strip("'") for x in list_match.group(1).split(",")] + items = [ + x.strip().strip('"').strip("'") + for x in list_match.group(1).split(",") + ] return [t for t in items if t] except Exception: pass diff --git a/ontolearner/learner/term_typing/rwthdbis.py b/ontolearner/learner/term_typing/rwthdbis.py index f27fd56..c8df797 100644 --- a/ontolearner/learner/term_typing/rwthdbis.py +++ b/ontolearner/learner/term_typing/rwthdbis.py @@ -27,10 +27,10 @@ TrainingArguments, set_seed, ) -from transformers import DebertaV2Tokenizer from ...base import AutoLearner + class RWTHDBISSFTLearner(AutoLearner): """ Supervised term-typing @@ -44,6 +44,7 @@ def __init__( model_name: str = "microsoft/deberta-v3-small", trained_model_path: Optional[str] = None, output_dir: Optional[str] = None, + device: str = "cpu", max_length: int = 64, per_device_train_batch_size: int = 16, gradient_accumulation_steps: int = 2, @@ -55,8 +56,35 @@ def __init__( save_total_limit: int = 1, fp16: bool = False, bf16: bool = False, - seed: int = 42 + seed: int = 42, ) -> None: + """Initialize the term-typing learner and configure training defaults. + + Args: + model_name: Backbone HF model identifier (used if `trained_model_path` is None). + trained_model_path: Optional path to a fine-tuned checkpoint for loading. + output_dir: Directory to write checkpoints and tokenizer; defaults to './term_typing'. + device: user-defined argument as 'cuda' or 'cpu'. + max_length: Maximum tokenized sequence length. + per_device_train_batch_size: Per-device batch size during training. + gradient_accumulation_steps: Number of update accumulation steps. + num_train_epochs: Training epochs. + learning_rate: Optimizer learning rate. + weight_decay: Weight decay coefficient. + logging_steps: Logging interval (steps) for the Trainer. + save_strategy: Checkpoint save strategy (e.g., 'epoch', 'steps', 'no'). + save_total_limit: Maximum number of checkpoints to keep. + fp16: Enable mixed precision (FP16) if supported. + bf16: Enable mixed precision (BF16) if supported. + seed: Random seed for reproducibility. + + Side Effects: + Creates `output_dir` if it does not exist. + + Notes: + The learner predicts exactly one label per term at inference time + (argmax over logits). 
+ """ super().__init__() self.model_name = model_name self.trained_model_path = trained_model_path @@ -76,7 +104,7 @@ def __init__( self.bf16 = bf16 self.seed = seed - self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device self.model: Optional[AutoModelForSequenceClassification] = None self.tokenizer: Optional[AutoTokenizer] = None self.id2label: Dict[int, str] = {} @@ -84,44 +112,53 @@ def __init__( def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: """ - train: expects ontology-like object with .term_typings - test: returns List[{"term": str, "types": [str]}] (for evaluator) - """ - if not test: - return self._train_from_term_typings(train_data=data) + Train or run inference for term typing, depending on `test`. - terms = self._collect_eval_terms(data) - return self._predict_structured_output(terms) + When `test=False`, trains on `data.term_typings`. + When `test=True`, predicts labels for provided terms. - def _load_robust_tokenizer(self, backbone: str) -> AutoTokenizer: - try: - return AutoTokenizer.from_pretrained(backbone, use_fast=True) - except Exception as fast_err: - print(f"[tokenizer] Fast tokenizer failed: {fast_err}. Trying DebertaV2Tokenizer (slow)...") + Args: + data: If training, an object with `.term_typings` where each item has + `term` and `types` (list[str]). If testing, either a `List[str]` + of raw term texts or an object with `.term_typings`. + test: If True, runs inference; otherwise trains. - try: - return DebertaV2Tokenizer.from_pretrained(backbone) - except Exception as slow_err: - print(f"[tokenizer] DebertaV2Tokenizer failed: {slow_err}. Trying AutoTokenizer(use_fast=False)...") + Returns: + If `test=True`: a list of dicts like + `[{"term": "", "types": [""]}, ...]`. + If `test=False`: None. - try: - return AutoTokenizer.from_pretrained(backbone, use_fast=False) - except Exception as final_err: - raise RuntimeError( - "Failed to load a tokenizer for this DeBERTa model.\n" - "Try:\n" - " - pip install --upgrade sentencepiece\n" - " - ensure network access for model files\n" - " - clear your HF cache and retry\n" - " - pin versions: transformers==4.43.*, tokenizers<0.20\n" - f"Original error: {final_err}" - ) + Raises: + ValueError: If required fields are missing from `data`. + """ + if test: + terms = self._collect_eval_terms(data) + return self._predict_structured_output(terms) + else: + self._train_from_term_typings(train_data=data) + return None def _expand_multilabel_training_rows( self, term_typings: List[Any] ) -> Tuple[List[str], List[int], Dict[int, str], Dict[str, int]]: """ - From multi-label instances -> (texts, label_ids), and label maps. + Expand multi-label instances into single-label rows and derive label maps. + + Each training instance with fields: + - `term`: str-like + - `types`: list of label strings + is expanded into len(types) rows with the same `term` and individual labels. + + Args: + term_typings: Sequence of objects (e.g., dataclasses) exposing + `.term` and `.types`. + + Returns: + A tuple `(texts, label_ids, id2label, label2id)`: + - texts: Flattened list of term strings (one per label). + - label_ids: Parallel list of integer label ids. + - id2label: Mapping from id -> label string. + - label2id: Mapping from label string -> id. 
""" label_strings: List[str] = [] for instance in term_typings: @@ -143,18 +180,53 @@ def _expand_multilabel_training_rows( def _collect_eval_terms(self, eval_data: Any) -> List[str]: """ - Accepts List[str] OR object with .term_typings; returns list of term texts. + Collect the list of term texts to predict for evaluation. + + Accepts either: + - A `List[str]` of raw term texts, or + - An object with `.term_typings`, from which `.term` is extracted. + + Args: + eval_data: Input carrier for terms. + + Returns: + List of term strings. + + Raises: + ValueError: If `eval_data` lacks the expected structure. """ if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): terms = eval_data else: term_typings = getattr(eval_data, "term_typings", None) if term_typings is None: - raise ValueError("Provide a List[str] OR an object with .term_typings for test=True.") + raise ValueError( + "Provide a List[str] OR an object with .term_typings for test=True." + ) terms = [str(instance.term) for instance in term_typings] return terms def _train_from_term_typings(self, train_data: Any) -> None: + """Train the term-typing classifier from `.term_typings`. + + Steps: + 1) Seed RNGs for reproducibility. + 2) Expand multi-label examples into single-label rows. + 3) Build HF `DatasetDict`, tokenizer, and data collator. + 4) Initialize `AutoModelForSequenceClassification`. + 5) Train with `Trainer` and save model/tokenizer to `output_dir`. + + Args: + train_data: Object with `.term_typings`; each item exposes + `.term` (text) and `.types` (list[str]). + + Raises: + ValueError: If `train_data` does not provide `.term_typings`. + + Side Effects: + Writes a trained model to `self.output_dir` and updates + `self.id2label` / `self.label2id`. + """ set_seed(self.seed) random.seed(self.seed) torch.manual_seed(self.seed) @@ -165,15 +237,26 @@ def _train_from_term_typings(self, train_data: Any) -> None: if term_typings is None: raise ValueError("train_data must provide .term_typings for term-typing.") - texts, label_ids, self.id2label, self.label2id = self._expand_multilabel_training_rows(term_typings) + texts, label_ids, self.id2label, self.label2id = ( + self._expand_multilabel_training_rows(term_typings) + ) - dataset = DatasetDict({"train": Dataset.from_dict({"labels": label_ids, "text": texts})}) + dataset = DatasetDict( + {"train": Dataset.from_dict({"labels": label_ids, "text": texts})} + ) backbone = self.trained_model_path or self.model_name - self.tokenizer = self._load_robust_tokenizer(backbone) + try: + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + except Exception: + # fallback if fast tokenizer isn't available + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=False) def tokenize_batch(batch: Dict[str, List[str]]): - return self.tokenizer(batch["text"], truncation=True, max_length=self.max_length) + """Tokenize a batch of texts with truncation and max length.""" + return self.tokenizer( + batch["text"], truncation=True, max_length=self.max_length + ) tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["text"]) data_collator = DataCollatorWithPadding(self.tokenizer) @@ -185,7 +268,10 @@ def tokenize_batch(batch: Dict[str, List[str]]): label2id=self.label2id, ) - if getattr(self.model.config, "pad_token_id", None) is None and self.tokenizer.pad_token_id is not None: + if ( + getattr(self.model.config, "pad_token_id", None) is None + and self.tokenizer.pad_token_id is not None + ): self.model.config.pad_token_id = 
self.tokenizer.pad_token_id training_args = TrainingArguments( @@ -216,11 +302,20 @@ def tokenize_batch(batch: Dict[str, List[str]]): self.tokenizer.save_pretrained(self.output_dir) def _ensure_loaded_for_inference(self) -> None: + """Load model/tokenizer for inference if not already loaded. + + Loads from `trained_model_path` if set, otherwise from `output_dir`. + Also restores `id2label`/`label2id` from the model config when present, + moves the model to the configured device, and sets eval mode. + """ if self.model is not None and self.tokenizer is not None: return model_path = self.trained_model_path or self.output_dir self.model = AutoModelForSequenceClassification.from_pretrained(model_path) - self.tokenizer = self._load_robust_tokenizer(model_path) + try: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + except Exception: + self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) cfg = self.model.config if hasattr(cfg, "id2label") and hasattr(cfg, "label2id"): @@ -230,20 +325,49 @@ def _ensure_loaded_for_inference(self) -> None: self.model.to(self.device).eval() def _predict_label_ids(self, terms: List[str]) -> List[int]: + """Predict label ids (argmax) for a list of term strings. + + Ensures model/tokenizer are loaded, then performs forward passes + term-by-term and collects the argmax label id. + + Args: + terms: List of raw term texts. + + Returns: + List of integer label ids corresponding to `terms`. + """ self._ensure_loaded_for_inference() predictions: List[int] = [] - for term_text in tqdm(terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"): - inputs = self.tokenizer(term_text, return_tensors="pt", truncation=True, max_length=self.max_length) + for term_text in tqdm( + terms, desc="Inference", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}" + ): + inputs = self.tokenizer( + term_text, + return_tensors="pt", + truncation=True, + max_length=self.max_length, + ) inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()} with torch.no_grad(): logits = self.model(**inputs).logits predictions.append(int(torch.argmax(logits, dim=-1).item())) return predictions - def _predict_structured_output(self, terms: List[str]) -> List[Dict[str, List[str]]]: + def _predict_structured_output( + self, terms: List[str] + ) -> List[Dict[str, List[str]]]: """ - Convert predicted IDs into evaluator format: - [{"term": "", "types": [""]}, ...] + Convert predicted label IDs into evaluator-friendly structured outputs. + + The output format is: + [{"term": "", "types": [""]}, ...] + + Args: + terms: Raw term texts to classify. + + Returns: + List of dicts mapping each input term to a list with its predicted + label string. Falls back to stringified id if label mapping is absent. """ label_ids = self._predict_label_ids(terms) id2label_map = self.id2label or {} # fallback handled below diff --git a/ontolearner/learner/term_typing/sbunlp.py b/ontolearner/learner/term_typing/sbunlp.py index f838bd0..d5c0114 100644 --- a/ontolearner/learner/term_typing/sbunlp.py +++ b/ontolearner/learner/term_typing/sbunlp.py @@ -20,123 +20,152 @@ from ...base import AutoLearner + class SBUNLPZSLearner(AutoLearner): """ Qwen-based blind term typing learner (Task B), implemented as an AutoLearner. - This class reproduces the notebook logic: - - Fit phase learns the *allowed type inventory* from training data. - - Predict phase performs blind prompting per term using the learned type list. 
- - Outputs are restricted to the allowed types and returned as [{"id", "types"}]. - - Expected I/O (recommended): - - fit(train_data, task="term-typing", ontologizer=True): - The framework's AutoLearner.tasks_data_former() provides a unique list of - type labels; we store it to `self.allowed_types`. - - predict(eval_data, task="term-typing", ontologizer=False): - Pass a list of dicts with keys {"id": str, "term": str} so IDs are preserved. - Returns a list of dicts [{"id": ..., "types": [...] }]. + Lifecycle: + • `fit(...)` learns/records the allowed type inventory from the training payload. + • `load(...)` explicitly loads the tokenizer/model (pass `model_id`/`token` here). + • `predict(...)` prompts the model per term and returns normalized types limited + to the learned inventory. """ def __init__( self, - model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", - device: Optional[str] = None, + device: str = "cpu", max_new_tokens: int = 64, temperature: float = 0.0, + model_id: str = "Qwen/Qwen2.5-0.5B-Instruct", token: Optional[str] = None, ) -> None: """ + Configure runtime knobs. Model identity and auth are provided to `load(...)`. + Args: - model_id: HF model id for Qwen. - device: "cuda", "mps", or "cpu". Auto-detected if None. - max_new_tokens: Generation cap per prompt. - temperature: Not used for greedy decoding (kept for future). - token: HF token if the model is gated. + device: Torch device policy ("cuda", "mps", or "cpu"). + max_new_tokens: Max tokens to generate per prompt (greedy decoding). + temperature: Reserved for future sampling; generation is greedy here. + model_id: Fallback model id/path used if `load()` is called without args. + token: Fallback HF token used if `load()` is called without args. + + Side Effects: + Initializes runtime configuration, instance defaults for `load()`, + and placeholders for `tokenizer`, `model`, and `allowed_types`. """ super().__init__() - - # Basic configuration - self.model_id = model_id - # default device detection: prefer CUDA if available - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.device = device self.max_new_tokens = max_new_tokens self.temperature = temperature + + # Defaults that load() may use when its args are None + self.model_id = model_id self.token = token - # Model/tokenizer placeholders (populated by load()) + # Placeholders populated by load() self.tokenizer: Optional[AutoTokenizer] = None self.model: Optional[AutoModelForCausalLM] = None - # Learned inventory of allowed type labels (populated by fit()) + # Learned inventory self.allowed_types: List[str] = [] - # Regex used to extract quoted strings from model output (e.g. "type") + # Regex used to extract quoted strings from model output (e.g., "type") self._quoted_re = re.compile(r'"([^"]+)"') - def load(self, **kwargs: Any): + def load( + self, + model_id: Optional[str] = None, + token: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + ): """ - Load Qwen model and tokenizer. + Load tokenizer and model weights explicitly. - NOTE: - - The HF arguments used here mirror your original code (`token=...`). - You may see a deprecation warning for `torch_dtype` (older transformers); - switching to `dtype=` is recommended but I did not change behavior here. - """ - # Respect overrides from kwargs if provided - model_id = kwargs.get("model_id", self.model_id) - token = kwargs.get("token", self.token) + Argument precedence: + 1) Use `model_id` / `token` passed to this method (if provided). 
+ 2) Else fall back to `self.model_id` / `self.token`. + + Device & dtype: + • If `dtype` is None, the default is float16 on CUDA/MPS and float32 on CPU. + • `device_map` is `"auto"` for non-CPU devices, `"cpu"` otherwise. + + Args: + model_id: HF model id/path to load. If None, uses `self.model_id`. + token: HF token if the model is gated. If None, uses `self.token`. + dtype: Optional torch dtype override (e.g., `torch.float16`). - # Load tokenizer. If the model is gated, pass token (original code uses `token`). - # If your environment requires `use_auth_token=` replace here. - self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token) + Returns: + self + """ + resolved_model_id = model_id or self.model_id + resolved_token = token if token is not None else self.token - # Ensure tokenizer has a pad token (some models omit it) + # Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + resolved_model_id, token=resolved_token + ) if self.tokenizer.pad_token is None: + # Prefer EOS as pad if available self.tokenizer.pad_token = self.tokenizer.eos_token - # Device mapping for from_pretrained -> keep same behavior as original code + # Device & dtype + if dtype is None: + if self.device == "cpu": + resolved_dtype = torch.float32 + else: + # Works for CUDA and Apple MPS + resolved_dtype = torch.float16 + else: + resolved_dtype = dtype + device_map = "auto" if self.device != "cpu" else "cpu" - # original code used torch_dtype; left as-is to avoid behavioral change - torch_dtype = torch.float16 if self.device != "cpu" else torch.float32 - # Load the model weights. This can be heavy; keep same params as original. self.model = AutoModelForCausalLM.from_pretrained( - model_id, + resolved_model_id, device_map=device_map, - torch_dtype=torch_dtype, - token=token, + torch_dtype=resolved_dtype, # keep torch_dtype for broad Transformers compatibility + token=resolved_token, ) return self - # ------------------------------------------------------------------------- - # Fit / Predict interface - # ------------------------------------------------------------------------- def fit(self, train_data: Any, task: str, ontologizer: bool = True): """ Learn the allowed type inventory from the training data. - Expected behavior: - - If `tasks_data_former(..., test=False)` returns a list of strings, - set allowed_types to that list (deduped & sorted). - - If it returns a list of dicts (relationships), extract unique 'parent' - fields and use those as the allowed type inventory. + Normalization rules: + • If `ontologizer=True`, the framework's `tasks_data_former(..., test=False)` + is used to normalize `train_data`. + • If a container exposes `.term_typings`, types are collected from there. + • If the normalized data is a list of dicts with `"parent"`, unique parents + become the allowed types. + • If it's a list of strings, that unique set becomes the allowed types. - This method contains a tolerant branch for the framework's custom container: - If the returned `train_fmt` is not a list but has a `.term_typings` attribute - (e.g., OntologyData object used by the framework), iterate that attribute - and collect any `types` values found. + Args: + train_data: Training payload provided by the pipeline. + task: Must be `"term-typing"`. + ontologizer: If True, normalize via `tasks_data_former()` first. + + Returns: + self + + Raises: + ValueError: If `task` is not `"term-typing"`. + TypeError: If the training data cannot be normalized to a list of + strings or relationship dicts. 
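+
+        Example (hypothetical payloads sketching the normalization rules):
+            ``fit([{"parent": "Food"}, {"parent": "Drink"}], task="term-typing", ontologizer=False)``
+            leaves ``self.allowed_types == ["Drink", "Food"]``, and a plain
+            ``["Food", "Drink", "Food"]`` payload yields the same inventory.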
""" - train_fmt = self.tasks_data_former(data=train_data, task=task, test=False) if ontologizer else train_data + train_fmt = ( + self.tasks_data_former(data=train_data, task=task, test=False) + if ontologizer + else train_data + ) if task != "term-typing": raise ValueError("SBUNLPZSLearner only implements 'term-typing'.") # If framework passed a container with `.term_typings`, extract types from there if not isinstance(train_fmt, list): - # handle OntologyData-like object with attribute 'term_typings' if hasattr(train_fmt, "term_typings"): try: - # term_typings is expected to be an iterable of objects with attribute `types` collected = set() for tt in getattr(train_fmt, "term_typings") or []: # tt.types could be list[str] or a single str @@ -147,7 +176,6 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True): else: tvals = None - # Normalize both list and single-string cases if isinstance(tvals, (list, tuple, set)): for x in tvals: if isinstance(x, str): @@ -155,145 +183,180 @@ def fit(self, train_data: Any, task: str, ontologizer: bool = True): elif isinstance(tvals, str): collected.add(tvals) - # If we successfully collected types, set allowed_types and return if collected: self.allowed_types = sorted(collected) return self - # else fall through to error below (no types found) except Exception: - # If anything unexpected occurs while iterating term_typings, - # gracefully fall through and raise the original TypeError below. + # Fall through to error below if unexpected issues occur. pass - # not a supported non-list type -> keep original behavior (raise) raise TypeError("For term-typing, expected a list of type labels at fit().") # At this point train_fmt is a list (original logic preserved) if train_fmt and isinstance(train_fmt[0], dict) and "parent" in train_fmt[0]: # Case A: Received raw relationships/pairs (e.g., from train_test_split). - # Extract unique parent types from the relationship records. unique_types = set(r.get("parent") for r in train_fmt if r.get("parent")) self.allowed_types = sorted(unique_types) elif all(isinstance(x, str) for x in train_fmt): # Case B: Received a clean list of type labels (List[str]). self.allowed_types = sorted(set(train_fmt)) else: - # The input is a list but not in either expected format -> raise - raise TypeError("For term-typing, input data format for fit() is invalid. Expected list of strings (types) or list of relationships (dicts).") + raise TypeError( + "For term-typing, input data format for fit() is invalid. " + "Expected list of strings (types) or list of relationships (dicts)." + ) return self def predict(self, eval_data: Any, task: str, ontologizer: bool = True) -> Any: """ - Predict types for each term. + Predict types for each term and return standardized rows. Expected inputs: - - With ontologizer=True: a list[str] of term strings (IDs are autogenerated). - - With ontologizer=False: a list[dict] where each dict has keys {'id','term'}. + • With `ontologizer=True`: a `list[str]` of terms (IDs are auto-generated), + or a container exposing `.term_typings` from which `{'id','term'}` pairs + can be extracted. + • With `ontologizer=False`: a `list[dict]` of `{'id','term'}` to preserve IDs. + + Args: + eval_data: Evaluation payload as described above. + task: Must be `"term-typing"`. + ontologizer: If True, normalize through the pipeline’s data former. 
- This method tolerantly converts common framework containers (e.g., an - OntologyData object exposing `.term_typings`) into the expected list[dict] - shape so that the internal _term_typing() can run unchanged. + Returns: + A list of dictionaries: + `{"id": str, "term": str, "types": List[str]}`. """ if task != "term-typing": # Delegate to base for other tasks (not implemented here) return super().predict(eval_data, task, ontologizer=ontologizer) - def _extract_list_of_dicts_from_term_typings(obj) -> Optional[List[Dict[str, str]]]: - """ - Helper: try to produce a list of {"id","term"} dicts from objects - exposing a `term_typings` iterable. Supports either object-like - TermTyping (attributes) or dict-style entries. - """ + def _extract_list_of_dicts_from_term_typings( + obj, + ) -> Optional[List[Dict[str, str]]]: + """Try to derive `[{id, term}, ...]` from an object with `.term_typings`.""" tts = getattr(obj, "term_typings", None) if tts is None: return None out = [] for tt in tts: - # support object-style TermTyping (attributes) and dict-style if isinstance(tt, dict): - # try several common key names for ID tid = tt.get("ID") or tt.get("id") or tt.get("Id") or tt.get("ID_") tterm = tt.get("term") or tt.get("label") or tt.get("name") else: - # object-style access - tid = getattr(tt, "ID", None) or getattr(tt, "id", None) or getattr(tt, "Id", None) - tterm = getattr(tt, "term", None) or getattr(tt, "label", None) or getattr(tt, "name", None) + tid = ( + getattr(tt, "ID", None) + or getattr(tt, "id", None) + or getattr(tt, "Id", None) + ) + tterm = ( + getattr(tt, "term", None) + or getattr(tt, "label", None) + or getattr(tt, "name", None) + ) if tid is None or tterm is None: - # skip malformed entry - this is defensive so downstream code has valid inputs continue out.append({"id": str(tid), "term": str(tterm)}) return out if out else None # Case A: ontologizer=True -> framework often provides list[str] if ontologizer: - if isinstance(eval_data, list) and all(isinstance(x, str) for x in eval_data): - # Simple case: convert list of terms to list of dicts with generated IDs - eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data)] + if isinstance(eval_data, list) and all( + isinstance(x, str) for x in eval_data + ): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(eval_data) + ] else: - # Try to extract from a framework container (e.g., OntologyData) maybe = _extract_list_of_dicts_from_term_typings(eval_data) if maybe is not None: eval_pack = maybe else: - # Last resort: if eval_data is some iterable of strings, convert it - try: - if hasattr(eval_data, "__iter__") and not isinstance(eval_data, (str, bytes)): - lst = list(eval_data) - if all(isinstance(x, str) for x in lst): - eval_pack = [{"id": f"TT_{i:06d}", "term": t} for i, t in enumerate(lst)] - else: - raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") + # Last resort: attempt to coerce iterables of str + if hasattr(eval_data, "__iter__") and not isinstance( + eval_data, (str, bytes) + ): + lst = list(eval_data) + if all(isinstance(x, str) for x in lst): + eval_pack = [ + {"id": f"TT_{i:06d}", "term": t} + for i, t in enumerate(lst) + ] else: - raise TypeError("With ontologizer=True, eval_data must be list[str] of terms.") - except TypeError: - # re-raise to preserve original error semantics - raise - # Delegate to internal inference routine + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." 
+ ) + else: + raise TypeError( + "With ontologizer=True, eval_data must be list[str] of terms." + ) return self._term_typing(eval_pack, test=True) - # Case B: ontologizer=False -> we expect list[dict], but tolerate common containers + # Case B: ontologizer=False -> expect list[dict], but tolerate containers else: - if isinstance(eval_data, list) and all(isinstance(x, dict) for x in eval_data): + if isinstance(eval_data, list) and all( + isinstance(x, dict) for x in eval_data + ): eval_pack = eval_data else: - # Try to extract from framework container (term_typings) maybe = _extract_list_of_dicts_from_term_typings(eval_data) if maybe is not None: eval_pack = maybe else: - # As a final attempt, allow eval_data to be a dict with a list under some known keys if isinstance(eval_data, dict): for key in ("term_typings", "terms", "items"): - if key in eval_data and isinstance(eval_data[key], (list, tuple)): + if key in eval_data and isinstance( + eval_data[key], (list, tuple) + ): converted = [] for x in eval_data[key]: - # Accept dict-style entries that include id and term/name - if isinstance(x, dict) and ("id" in x or "ID" in x) and ("term" in x or "name" in x): + if ( + isinstance(x, dict) + and ("id" in x or "ID" in x) + and ("term" in x or "name" in x) + ): tid = x.get("ID") or x.get("id") tterm = x.get("term") or x.get("name") - converted.append({"id": str(tid), "term": str(tterm)}) + converted.append( + {"id": str(tid), "term": str(tterm)} + ) if converted: eval_pack = converted break else: - # Could not convert; raise same TypeError as before - raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) else: - # Not a supported container -> raise - raise TypeError("With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}.") - # Delegate to internal inference routine + raise TypeError( + "With ontologizer=False, eval_data must be a list of dicts with keys {'id','term'}." + ) return self._term_typing(eval_pack, test=True) - - # ------------------------------------------------------------------------- - # Internal task implementations (AutoLearner hooks) - # ------------------------------------------------------------------------- def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: """ - Core implementation: - - training mode (test=False): `data` is a list of allowed type labels -> store them. - - inference mode (test=True): `data` is a list of {"id","term"} -> produce [{"id","types"}]. + Internal implementation of the *term-typing* task. + + Training mode (`test=False`): + • Expects a `list[str]` of allowed types. Stores a sorted unique copy. + + Inference mode (`test=True`): + • Expects a `list[dict]` of `{"id","term"}` items. + • Requires `load()` to have been called (model/tokenizer available). + • Builds a blind prompt per item, generates text, parses quoted + candidates, and filters them to `self.allowed_types`. + + Args: + data: See the mode-specific expectations above. + test: Set `True` to run inference; `False` to store the type inventory. + + Returns: + • `None` in training mode. + • `list[dict]` with `{"id","term","types":[...]}` in inference mode. + + Raises: + TypeError: If `data` is not in the expected shape for the mode. + RuntimeError: If model/tokenizer are not loaded at inference time. 
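+
+        Example (inference mode; the type label is hypothetical):
+            With ``data=[{"id": "TT_000001", "term": "espresso"}]`` and an
+            inventory that contains ``"Beverage"``, a successful run could
+            return ``[{"id": "TT_000001", "term": "espresso", "types": ["Beverage"]}]``.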
""" if not test: # training: expect a list of strings (type labels) @@ -304,49 +367,58 @@ def _term_typing(self, data: Any, test: bool = False) -> Optional[Any]: # Inference path if not isinstance(data, list) or not all(isinstance(x, dict) for x in data): - raise TypeError("At prediction time, expected a list of {'id','term'} dicts.") + raise TypeError( + "At prediction time, expected a list of {'id','term'} dicts." + ) - # Ensure model and tokenizer are loaded if self.model is None or self.tokenizer is None: - raise RuntimeError("Model/tokenizer not loaded. Call .load() before predict().") + raise RuntimeError( + "Model/tokenizer not loaded. Call .load() before predict()." + ) results = [] for item in data: - # preserve incoming IDs and terms term_id = item["id"] term_text = item["term"] - - # build the blind JSON-prompt that instructs the model to output types prompt = self._build_blind_prompt(term_id, term_text, self.allowed_types) - - # generate and parse model output into allowed types types = self._generate_and_parse_types(prompt) - - # append result for this term (keep original id) - # include the original term so downstream evaluation (and any consumers) can match by term results.append({"id": term_id, "term": term_text, "types": types}) return results - # ------------------------------------------------------------------------- - # Prompting + parsing - # ------------------------------------------------------------------------- - - def _format_types_inline(allowed: List[str]) -> str: + def _format_types_inline(self, allowed: List[str]) -> str: """ - Format allowed types as comma-separated quoted strings for insertion into the prompt. - Example: '"type1", "type2", "type3"' + Format the allowed types for inline inclusion in prompts. + + Args: + allowed: List of allowed type labels. + + Returns: + A comma-separated string of quoted types, e.g.: + `"type1", "type2", "type3"`. Returns an empty string for an empty list. """ - return ", ".join(f'"{t}"' for t in allowed) + if not allowed: + return "" + return ", ".join(f'"{t}"' for t in allowed if isinstance(t, str) and t.strip()) - def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) -> str: + def _build_blind_prompt( + self, term_id: str, term: str, allowed_types: List[str] + ) -> str: """ - Construct the prompt given a single term. The prompt: - - Instructs the model to produce a JSON array of {id, types} objects. - - Provides the allowed types list (so the model should only use those). - - Includes the single input item for which the model must decide types. + Construct the blind JSON prompt for a single term. + + The prompt: + • Instructs the model to produce ONLY a JSON array of `{id, types}` objects. + • Provides the allowed types list so the model should only use those. + • Includes the single input item for which the model must decide types. + + Args: + term_id: Identifier to carry through to the output JSON. + term: The input term string to classify. + allowed_types: Inventory used to constrain outputs. - Note: This is the same blind-prompting approach used in the original notebook. + Returns: + The full prompt string to feed to the LLM. """ allowed_str = self._format_types_inline(allowed_types) return ( @@ -367,14 +439,22 @@ def _build_blind_prompt(self, term_id: str, term: str, allowed_types: List[str]) def _generate_and_parse_types(self, prompt: str) -> List[str]: """ - Greedy generate, then extract quoted strings and filter by allowed types. 
- - Important details: - - We assert model/tokenizer presence before calling. - - Tokenized inputs are moved to the model device (original code uses .to(self.model.device)). - - The decoded text is scanned for quoted substrings using self._quoted_re. - - Only quoted strings that are present in self.allowed_types are kept. - - Returned list is unique & sorted for deterministic ordering. + Greedy-generate text, extract candidate types, and filter to the inventory. + + Workflow: + 1) Tokenize the prompt and generate deterministically (greedy). + 2) Decode and extract quoted substrings via regex (e.g., `"type"`). + 3) Keep only those candidates that exist in `self.allowed_types`. + 4) Return a unique, sorted list (stable across runs). + + Args: + prompt: Fully formatted prompt string. + + Returns: + List of predicted type labels (possibly empty if none found). + + Raises: + AssertionError: If `model` or `tokenizer` are unexpectedly `None`. """ assert self.model is not None and self.tokenizer is not None @@ -393,8 +473,6 @@ def _generate_and_parse_types(self, prompt: str) -> List[str]: text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) candidates = self._quoted_re.findall(text) - # Filter candidates to the allowed inventory + # Filter candidates to the allowed inventory and stabilize order. filtered = [c for c in candidates if c in self.allowed_types] - - # Return unique & sorted for stability across runs return sorted(set(filtered)) diff --git a/ontolearner/learner/text2onto/__init__.py b/ontolearner/learner/text2onto/__init__.py deleted file mode 100644 index 6408881..0000000 --- a/ontolearner/learner/text2onto/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2025 SciKnowOrg -# -# Licensed under the MIT License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/MIT -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .sbunlp import SBUNLPFewShotLearner -from .alexbek import AlexbekFewShotLearner diff --git a/ontolearner/learner/text2onto/alexbek.py b/ontolearner/learner/text2onto/alexbek.py index 5760dca..f1692f7 100644 --- a/ontolearner/learner/text2onto/alexbek.py +++ b/ontolearner/learner/text2onto/alexbek.py @@ -31,6 +31,7 @@ class _PredictedTypesSchema(BaseModel): """Schema used when generating structured JSON { "types": [...] }.""" + types: List[str] OUTLINES_AVAILABLE: bool = True @@ -41,6 +42,7 @@ class _PredictedTypesSchema(BaseModel): OutlinesTFModel = None outlines_generate_json = None + class LocalAutoLLM(AutoLLM): """ Minimal local LLM helper. 
@@ -101,11 +103,15 @@ def load(self, model_id: str, *, load_in_4bit: bool = False) -> None: token=self.token, ) else: - device_map = "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + device_map = ( + "auto" if (self.device != "cpu" and torch.cuda.is_available()) else None + ) self.model = AutoModelForCausalLM.from_pretrained( model_id, device_map=device_map, - torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + torch_dtype=torch.bfloat16 + if torch.cuda.is_available() + else torch.float32, token=self.token, ) @@ -134,11 +140,17 @@ def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: Decoded new-token texts (no special tokens, stripped). """ if self.model is None or self.tokenizer is None: - raise RuntimeError("Call .load(model_id) on LocalAutoLLM before generate().") + raise RuntimeError( + "Call .load(model_id) on LocalAutoLLM before generate()." + ) - tokenized_batch = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True) + tokenized_batch = self.tokenizer( + prompts, return_tensors="pt", padding=True, truncation=True + ) input_seq_len = tokenized_batch["input_ids"].shape[1] - tokenized_batch = {k: v.to(self.model.device) for k, v in tokenized_batch.items()} + tokenized_batch = { + k: v.to(self.model.device) for k, v in tokenized_batch.items() + } with torch.no_grad(): outputs = self.model.generate( @@ -151,7 +163,11 @@ def generate(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]: # Only return the newly generated part for each row in the batch continuation_token_ids = outputs[:, input_seq_len:] - return [self.tokenizer.decode(row, skip_special_tokens=True).strip() for row in continuation_token_ids] + return [ + self.tokenizer.decode(row, skip_special_tokens=True).strip() + for row in continuation_token_ids + ] + class AlexbekFewShotLearner(AutoLearner): """ @@ -168,6 +184,7 @@ class AlexbekFewShotLearner(AutoLearner): Reads your A1 results (docs→terms), predicts types for each term, and writes two files: terms2types_pred.json + types2docs_pred.json """ + def __init__(self, model: LocalAutoLLM, device: str = "cpu", **_: Any) -> None: """ Initialize learner state and canned prompts. @@ -243,7 +260,9 @@ def fit( # Load item -> [doc_ids] item_to_docs_map = self._load_json(terms2doc_json) if not isinstance(item_to_docs_map, dict): - raise ValueError(f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]") + raise ValueError( + f"{terms2doc_json} must be a JSON dict mapping item -> [doc_ids]" + ) # Reverse mapping: doc_id -> [items] doc_id_to_items_map: Dict[str, List[str]] = {} @@ -258,17 +277,25 @@ def fit( if not doc_row: continue doc_title = str(doc_row.get("title", "")) # be defensive (may be None) - doc_text = self._to_text(doc_row.get("text", "")) # string-ify list if needed + doc_text = self._to_text( + doc_row.get("text", "") + ) # string-ify list if needed if not doc_text: continue - gold_items = self._unique_preserve([s for s in labeled_items if isinstance(s, str)]) + gold_items = self._unique_preserve( + [s for s in labeled_items if isinstance(s, str)] + ) if gold_items: exemplar_candidates.append((doc_title, doc_text, gold_items)) if not exemplar_candidates: - raise RuntimeError("No candidate docs with items found to build few-shot exemplars.") + raise RuntimeError( + "No candidate docs with items found to build few-shot exemplars." 
+ ) - chosen_exemplars = rng.sample(exemplar_candidates, k=min(sample_size, len(exemplar_candidates))) + chosen_exemplars = rng.sample( + exemplar_candidates, k=min(sample_size, len(exemplar_candidates)) + ) # Reuse exemplars for both docs→terms and docs→types prompting self._fewshot_terms_docs = chosen_exemplars self._fewshot_types_docs = chosen_exemplars @@ -315,7 +342,10 @@ def predict_terms( text = self._to_text(document_row.get("text", "")) fewshot_block = self._format_fewshot_block( - self._system_prompt_terms, self._fewshot_terms_docs, key="terms", k=few_shot_k + self._system_prompt_terms, + self._fewshot_terms_docs, + key="terms", + k=few_shot_k, ) user_block = self._format_user_block(title, text) @@ -323,7 +353,9 @@ def predict_terms( document_order.append(document_id) generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_term_lists = [self._parse_json_list(generated, key="terms") for generated in generations] + parsed_term_lists = [ + self._parse_json_list(generated, key="terms") for generated in generations + ] os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) lines_written = 0 @@ -334,7 +366,6 @@ def predict_terms( lines_written += 1 return lines_written - def predict_types( self, *, @@ -377,7 +408,10 @@ def predict_types( text = self._to_text(document_row.get("text", "")) fewshot_block = self._format_fewshot_block( - self._system_prompt_types, self._fewshot_types_docs, key="types", k=few_shot_k + self._system_prompt_types, + self._fewshot_types_docs, + key="types", + k=few_shot_k, ) user_block = self._format_user_block(title, text) @@ -385,7 +419,9 @@ def predict_types( document_order.append(document_id) generations = self.model.generate(prompts, max_new_tokens=max_new_tokens) - parsed_type_lists = [self._parse_json_list(generated, key="types") for generated in generations] + parsed_type_lists = [ + self._parse_json_list(generated, key="types") for generated in generations + ] os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True) lines_written = 0 @@ -426,7 +462,9 @@ def evaluate_extraction_f1( gold_doc_to_items: Dict[str, set] = {} for item_label, doc_id_list in item_to_doc_ids.items(): for document_id in doc_id_list: - gold_doc_to_items.setdefault(document_id, set()).add(self._norm(item_label)) + gold_doc_to_items.setdefault(document_id, set()).add( + self._norm(item_label) + ) # Build predictions: doc_id -> set(items) pred_doc_to_items: Dict[str, set] = {} @@ -435,7 +473,9 @@ def evaluate_extraction_f1( row = json.loads(line.strip()) document_id = str(row.get("id", "")) items_list = row.get("terms" if key == "term" else "types", []) - pred_doc_to_items[document_id] = {self._norm(x) for x in items_list if isinstance(x, str)} + pred_doc_to_items[document_id] = { + self._norm(x) for x in items_list if isinstance(x, str) + } # Micro counts true_positive = false_positive = false_negative = 0 @@ -447,18 +487,34 @@ def evaluate_extraction_f1( false_positive += len(pred_set - gold_set) false_negative += len(gold_set - pred_set) - precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0 - recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0 - f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0 + precision = ( + true_positive / (true_positive + false_positive) + if (true_positive + false_positive) + else 0.0 + ) + recall = ( + true_positive / (true_positive + false_negative) + if (true_positive + 
false_negative) + else 0.0 + ) + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) + else 0.0 + ) return f1 def predict_types_from_terms( self, *, - doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl - doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list - few_shot_jsonl: Optional[str] = None, # JSONL lines: {"term":"...", "types":[...]} - rag_terms_json: Optional[str] = None, # JSON list; items may contain "term" and "RAG":[...] + doc_terms_jsonl: Optional[str] = None, # formerly a1_results_jsonl + doc_terms_list: Optional[List[Dict]] = None, # formerly a1_results_list + few_shot_jsonl: Optional[ + str + ] = None, # JSONL lines: {"term":"...", "types":[...]} + rag_terms_json: Optional[ + str + ] = None, # JSON list; items may contain "term" and "RAG":[...] random_few_shot: Optional[int] = 3, model_id: str = "Qwen/Qwen2.5-1.5B-Instruct", use_structured_output: bool = True, @@ -507,7 +563,9 @@ def predict_types_from_terms( in_memory_results=doc_terms_list, ) if not doc_term_extractions: - raise ValueError("No document→terms results provided (doc_terms_jsonl/doc_terms_list).") + raise ValueError( + "No document→terms results provided (doc_terms_jsonl/doc_terms_list)." + ) # Prepare unique term list and term→doc occurrences unique_terms = self._collect_unique_terms_from_extractions(doc_term_extractions) @@ -525,7 +583,11 @@ def predict_types_from_terms( json_obj = json.loads(raw_line) except Exception: continue - if isinstance(json_obj, dict) and "term" in json_obj and "types" in json_obj: + if ( + isinstance(json_obj, dict) + and "term" in json_obj + and "types" in json_obj + ): global_few_shot_examples.append(json_obj) # Optional per-term RAG examples: {normalized_term -> [examples]} @@ -536,8 +598,12 @@ def predict_types_from_terms( if isinstance(rag_payload, list): for rag_item in rag_payload: if isinstance(rag_item, dict): - normalized_term = self._normalize_term(rag_item.get("term", "")) - rag_examples_lookup[normalized_term] = rag_item.get("RAG", []) + normalized_term = self._normalize_term( + rag_item.get("term", "") + ) + rag_examples_lookup[normalized_term] = rag_item.get( + "RAG", [] + ) except Exception: pass @@ -550,7 +616,10 @@ def predict_types_from_terms( normalized_term = self._normalize_term(term_text) # Prefer per-term RAG for this term, else use global few-shot - few_shot_examples_for_term = rag_examples_lookup.get(normalized_term, None) or global_few_shot_examples + few_shot_examples_for_term = ( + rag_examples_lookup.get(normalized_term, None) + or global_few_shot_examples + ) # Build conversation and prompt conversation_messages = self._build_conv_for_type_infer( @@ -558,28 +627,51 @@ def predict_types_from_terms( few_shot_examples=few_shot_examples_for_term, random_k=random_few_shot, ) - typing_prompt_string = self._apply_chat_template_safe_types(typing_tokenizer, conversation_messages) + typing_prompt_string = self._apply_chat_template_safe_types( + typing_tokenizer, conversation_messages + ) predicted_types: List[str] = [] raw_generation_text: str = "" # Structured JSON path (if requested and available) - if use_structured_output and OUTLINES_AVAILABLE and _PredictedTypesSchema is not None: + if ( + use_structured_output + and OUTLINES_AVAILABLE + and _PredictedTypesSchema is not None + ): try: outlines_model = OutlinesTFModel(typing_model, typing_tokenizer) # type: ignore - generator = outlines_generate_json(outlines_model, _PredictedTypesSchema) # type: ignore + generator = outlines_generate_json( + 
outlines_model, _PredictedTypesSchema + ) # type: ignore structured = generator(typing_prompt_string, max_tokens=512) - predicted_types = [label for label in structured.types if isinstance(label, str)] - raw_generation_text = json.dumps({"types": predicted_types}, ensure_ascii=False) + predicted_types = [ + label for label in structured.types if isinstance(label, str) + ] + raw_generation_text = json.dumps( + {"types": predicted_types}, ensure_ascii=False + ) except Exception: # Fall back to greedy decoding use_structured_output = False # Greedy decode fallback - if not use_structured_output or not OUTLINES_AVAILABLE or _PredictedTypesSchema is None: - tokenized_prompt = typing_tokenizer(typing_prompt_string, return_tensors="pt", truncation=True, max_length=2048) + if ( + not use_structured_output + or not OUTLINES_AVAILABLE + or _PredictedTypesSchema is None + ): + tokenized_prompt = typing_tokenizer( + typing_prompt_string, + return_tensors="pt", + truncation=True, + max_length=2048, + ) if torch.cuda.is_available(): - tokenized_prompt = {name: tensor.cuda() for name, tensor in tokenized_prompt.items()} + tokenized_prompt = { + name: tensor.cuda() for name, tensor in tokenized_prompt.items() + } with torch.no_grad(): output_ids = typing_model.generate( **tokenized_prompt, @@ -588,14 +680,18 @@ def predict_types_from_terms( num_beams=1, pad_token_id=typing_tokenizer.eos_token_id, ) - new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1]:] - raw_generation_text = typing_tokenizer.decode(new_token_span, skip_special_tokens=True) + new_token_span = output_ids[0][tokenized_prompt["input_ids"].shape[1] :] + raw_generation_text = typing_tokenizer.decode( + new_token_span, skip_special_tokens=True + ) predicted_types = self._extract_types_from_text(raw_generation_text) - term_to_predicted_types_list.append({ - "term": term_text, - "predicted_types": sorted(set(predicted_types)), - }) + term_to_predicted_types_list.append( + { + "term": term_text, + "predicted_types": sorted(set(predicted_types)), + } + ) # 7) Build types→docs from (term→types) and (term→docs) types_to_doc_id_set: Dict[str, set] = {} @@ -603,16 +699,24 @@ def predict_types_from_terms( normalized_term = self._normalize_term(term_prediction["term"]) doc_ids_for_term = term_to_doc_ids_map.get(normalized_term, []) for type_label in term_prediction.get("predicted_types", []): - types_to_doc_id_set.setdefault(type_label, set()).update(doc_ids_for_term) + types_to_doc_id_set.setdefault(type_label, set()).update( + doc_ids_for_term + ) types_to_doc_ids: Dict[str, List[str]] = { - type_label: sorted(doc_id_set) for type_label, doc_id_set in types_to_doc_id_set.items() + type_label: sorted(doc_id_set) + for type_label, doc_id_set in types_to_doc_id_set.items() } # 8) Save outputs os.makedirs(os.path.dirname(out_terms2types) or ".", exist_ok=True) with open(out_terms2types, "w", encoding="utf-8") as fp_terms2types: - json.dump(term_to_predicted_types_list, fp_terms2types, ensure_ascii=False, indent=2) + json.dump( + term_to_predicted_types_list, + fp_terms2types, + ensure_ascii=False, + indent=2, + ) os.makedirs(os.path.dirname(out_types2docs) or ".", exist_ok=True) with open(out_types2docs, "w", encoding="utf-8") as fp_types2docs: @@ -635,7 +739,6 @@ def _load_json(self, path: str) -> Dict[str, Any]: with open(path, "r", encoding="utf-8") as file_obj: return json.load(file_obj) - def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: """ Iterate over *all* JSON objects found inside a string. 
@@ -669,7 +772,6 @@ def _iter_json_objects(self, blob: str) -> Iterable[Dict[str, Any]]: yield json_obj cursor_index = end_index - def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: """ Robust reader that supports: @@ -727,7 +829,6 @@ def _load_documents_jsonl(self, path: str) -> Dict[str, Dict[str, Any]]: return documents_by_id - def _to_text(self, text_field: Any) -> str: """ Convert a 'text' field into a single string (handles list-of-strings). @@ -748,7 +849,6 @@ def _to_text(self, text_field: Any) -> str: return " ".join(str(part) for part in text_field) return str(text_field) if text_field is not None else "" - def _unique_preserve(self, values: List[str]) -> List[str]: """ Deduplicate values while preserving the original order. @@ -771,7 +871,6 @@ def _unique_preserve(self, values: List[str]) -> List[str]: ordered_values.append(candidate) return ordered_values - def _norm(self, text: str) -> str: """ Lowercased, single-spaced normalization (for comparisons). @@ -788,7 +887,6 @@ def _norm(self, text: str) -> str: """ return " ".join(text.lower().split()) - def _normalize_term(self, term: str) -> str: """ Normalization tailored for term keys / lookups. @@ -805,7 +903,6 @@ def _normalize_term(self, term: str) -> str: """ return " ".join(str(term).strip().split()).lower() - def _format_fewshot_block( self, system_prompt: str, @@ -846,10 +943,13 @@ def _format_fewshot_block( for example_title, example_text, gold_list in fewshot_examples[:k]: lines.append("### Example") lines.append(f"User:\nTitle: {example_title}\n{example_text}") - lines.append(f'Assistant:\n{{"{key}": ' + json.dumps(gold_list, ensure_ascii=False) + "}") + lines.append( + f'Assistant:\n{{"{key}": ' + + json.dumps(gold_list, ensure_ascii=False) + + "}" + ) return "\n".join(lines) - def _format_user_block(self, title: str, text: str) -> str: """ Format the 'Task' block for the current document. @@ -868,7 +968,6 @@ def _format_user_block(self, title: str, text: str) -> str: """ return f"### Task\nUser:\nTitle: {title}\n{text}" - def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: """ Extract a list from model output, trying: @@ -911,23 +1010,34 @@ def _parse_json_list(self, generated_text: str, *, key: str) -> List[str]: # 3) Fallback: comma-split (last resort) if "," in generated_text: - return [part.strip().strip('"').strip("'") for part in generated_text.split(",") if part.strip()] + return [ + part.strip().strip('"').strip("'") + for part in generated_text.split(",") + if part.strip() + ] return [] - - def _apply_chat_template_safe_types(self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]]) -> str: + def _apply_chat_template_safe_types( + self, tokenizer: AutoTokenizer, messages: List[Dict[str, str]] + ) -> str: """ Safely build a prompt string for chat models. Uses the model's chat template when available; otherwise falls back to a simple concatenation. 
""" try: - return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + return tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) except Exception: - system_text = next((m["content"] for m in messages if m.get("role") == "system"), "") - last_user_text = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + system_text = next( + (m["content"] for m in messages if m.get("role") == "system"), "" + ) + last_user_text = next( + (m["content"] for m in reversed(messages) if m.get("role") == "user"), + "", + ) return f"{system_text}\n\nUser:\n{last_user_text}\n\nAssistant:" - def _build_conv_for_type_infer( self, term: str, @@ -938,20 +1048,27 @@ def _build_conv_for_type_infer( Create a chat-style conversation for a single term→types query, optionally prepending few-shot examples. """ - messages: List[Dict[str, str]] = [{"role": "system", "content": self._system_prompt_term_to_types}] + messages: List[Dict[str, str]] = [ + {"role": "system", "content": self._system_prompt_term_to_types} + ] examples = list(few_shot_examples or []) if random_k and len(examples) > random_k: import random as _rnd + examples = _rnd.sample(examples, random_k) for exemplar in examples: example_term = exemplar.get("term", "") example_types = exemplar.get("types", []) messages.append({"role": "user", "content": f"Term: {example_term}"}) - messages.append({"role": "assistant", "content": json.dumps({"types": example_types}, ensure_ascii=False)}) + messages.append( + { + "role": "assistant", + "content": json.dumps({"types": example_types}, ensure_ascii=False), + } + ) messages.append({"role": "user", "content": f"Term: {term}"}) return messages - def _extract_types_from_text(self, generated_text: str) -> List[str]: """ Parse {"types":[...]} from a free-form generation. @@ -961,13 +1078,18 @@ def _extract_types_from_text(self, generated_text: str) -> List[str]: if object_match: json_obj = json.loads(object_match.group(0)) types_array = json_obj.get("types", []) - return [type_label for type_label in types_array if isinstance(type_label, str)] + return [ + type_label + for type_label in types_array + if isinstance(type_label, str) + ] except Exception: pass return [] - - def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + def _load_llm_for_types( + self, model_id: str + ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: """ Load a *separate* small chat model for Term→Types (keeps LocalAutoLLM untouched). 
""" @@ -981,7 +1103,6 @@ def _load_llm_for_types(self, model_id: str) -> Tuple[AutoModelForCausalLM, Auto ) return model, tokenizer - def _load_doc_term_extractions( self, *, @@ -1002,17 +1123,26 @@ def _load_doc_term_extractions( normalized_records: List[Dict] = [] def _coerce_to_record(source_row: Dict) -> Optional[Dict]: - document_id = str(source_row.get("id", "")) or str(source_row.get("doc_id", "")) + document_id = str(source_row.get("id", "")) or str( + source_row.get("doc_id", "") + ) if not document_id: return None terms = source_row.get("extracted_terms") if terms is None: terms = source_row.get("terms") - if terms is None and "payload" in source_row and isinstance(source_row["payload"], dict): + if ( + terms is None + and "payload" in source_row + and isinstance(source_row["payload"], dict) + ): terms = source_row["payload"].get("terms") if not isinstance(terms, list): terms = [] - return {"id": document_id, "extracted_terms": [t for t in terms if isinstance(t, str)]} + return { + "id": document_id, + "extracted_terms": [t for t in terms if isinstance(t, str)], + } if in_memory_results is not None: for source_row in in_memory_results: @@ -1053,8 +1183,9 @@ def _coerce_to_record(source_row: Dict) -> Optional[Dict]: return normalized_records - - def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict]) -> List[str]: + def _collect_unique_terms_from_extractions( + self, doc_term_extractions: List[Dict] + ) -> List[str]: """ Collect unique terms (original casing) from normalized document→terms results. """ @@ -1068,8 +1199,9 @@ def _collect_unique_terms_from_extractions(self, doc_term_extractions: List[Dict ordered_unique_terms.append(term_text.strip()) return ordered_unique_terms - - def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, List[str]]: + def _build_term_to_doc_ids( + self, doc_term_extractions: List[Dict] + ) -> Dict[str, List[str]]: """ Build lookup: normalized_term -> sorted unique list of doc_ids. """ @@ -1081,4 +1213,7 @@ def _build_term_to_doc_ids(self, doc_term_extractions: List[Dict]) -> Dict[str, if not normalized or not document_id: continue term_to_doc_set.setdefault(normalized, set()).add(document_id) - return {normalized_term: sorted(doc_ids) for normalized_term, doc_ids in term_to_doc_set.items()} + return { + normalized_term: sorted(doc_ids) + for normalized_term, doc_ids in term_to_doc_set.items() + } diff --git a/ontolearner/learner/text2onto/sbunlp.py b/ontolearner/learner/text2onto/sbunlp.py index 8ab617d..49067e2 100644 --- a/ontolearner/learner/text2onto/sbunlp.py +++ b/ontolearner/learner/text2onto/sbunlp.py @@ -25,6 +25,7 @@ from ...base import AutoLearner, AutoLLM + # ----------------------------------------------------------------------------- # Concrete AutoLLM: local HF wrapper that follows the AutoLLM interface # ----------------------------------------------------------------------------- @@ -34,19 +35,29 @@ class LocalAutoLLM(AutoLLM): Uses 4-bit quantization for efficiency and greedy decoding by default. 
""" - def __init__(self, label_mapper: Any = None, device: str = "cpu", token: str = "") -> None: + def __init__( + self, label_mapper: Any = None, device: str = "cpu", token: str = "" + ) -> None: super().__init__(label_mapper=label_mapper, device=device, token=token) self.model = None self.tokenizer = None - def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", trust_remote_code: bool = True): + def load( + self, + model_id: str, + load_in_4bit: bool = False, + dtype: str = "auto", + trust_remote_code: bool = True, + ): """Load tokenizer + model, applying 4-bit quantization if specified and possible.""" # Determine the target data type (default to float32 for CPU, float16 for GPU) - torch_dtype_val = (torch.float16 if torch.cuda.is_available() else torch.float32) + torch_dtype_val = torch.float16 if torch.cuda.is_available() else torch.float32 # Load the tokenizer - self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=trust_remote_code + ) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token @@ -78,7 +89,13 @@ def load(self, model_id: str, load_in_4bit: bool = False, dtype: str = "auto", t if self.device == "cpu": self.model.to("cpu") - def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: float = 0.0, top_p: float = 1.0) -> List[str]: + def generate( + self, + inputs: List[str], + max_new_tokens: int = 64, + temperature: float = 0.0, + top_p: float = 1.0, + ) -> List[str]: """Generate continuations for a list of prompts, returning only the generated part.""" if self.model is None or self.tokenizer is None: raise RuntimeError("Model/tokenizer not loaded. 
Call .load() first.") @@ -100,7 +117,9 @@ def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: flo input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, - do_sample=(temperature > 0.0), # Use greedy decoding if temperature is 0.0 + do_sample=( + temperature > 0.0 + ), # Use greedy decoding if temperature is 0.0 temperature=temperature, top_p=top_p, pad_token_id=self.tokenizer.eos_token_id, @@ -109,20 +128,25 @@ def generate(self, inputs: List[str], max_new_tokens: int = 64, temperature: flo # --- Post-processing: Extract only the generated tail --- decoded_outputs: List[str] = [] for i, output_ids in enumerate(outputs): - full_decoded_text = self.tokenizer.decode(output_ids, skip_special_tokens=True) + full_decoded_text = self.tokenizer.decode( + output_ids, skip_special_tokens=True + ) prompt_text = self.tokenizer.decode(input_ids[i], skip_special_tokens=True) # Safely strip the prompt text from the full output if full_decoded_text.startswith(prompt_text): - generated_tail = full_decoded_text[len(prompt_text):].strip() + generated_tail = full_decoded_text[len(prompt_text) :].strip() else: # Fallback extraction (less robust if padding affects token indices) prompt_len = input_ids.shape[1] - generated_tail = self.tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=True).strip() + generated_tail = self.tokenizer.decode( + output_ids[prompt_len:], skip_special_tokens=True + ).strip() decoded_outputs.append(generated_tail) return decoded_outputs + # ----------------------------------------------------------------------------- # Main Learner: SBUNLPFewShotLearner (Task A Text2Onto) # ----------------------------------------------------------------------------- @@ -195,7 +219,11 @@ def build_stratified_fewshot_prompt( num_to_sample_from_stratum = int(num_sample_docs * proportion) if num_to_sample_from_stratum > 0: - sampled_documents.extend(random.sample(stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs))) + sampled_documents.extend( + random.sample( + stratum_docs, min(num_to_sample_from_stratum, num_stratum_docs) + ) + ) # Deduplicate sampled documents by ID and adjust count to exactly 'sample_size' unique_docs_by_id = {} @@ -207,8 +235,12 @@ def build_stratified_fewshot_prompt( if len(final_sample_docs) > num_sample_docs: final_sample_docs = random.sample(final_sample_docs, num_sample_docs) elif len(final_sample_docs) < num_sample_docs: - remaining_docs = [d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id] - needed_count = min(num_sample_docs - len(final_sample_docs), len(remaining_docs)) + remaining_docs = [ + d for d in corpus_documents if d.get("id", "") not in unique_docs_by_id + ] + needed_count = min( + num_sample_docs - len(final_sample_docs), len(remaining_docs) + ) final_sample_docs.extend(random.sample(remaining_docs, needed_count)) # Format the few-shot exemplar text block @@ -299,21 +331,31 @@ def build_types_fewshot_block( picked_count += 1 if picked_count >= sample_per_term: - break # Move to the next term + break # Move to the next term prompt_block = "\n".join(prompt_lines) self.fewshot_types_block = prompt_block return prompt_block - def fit(self, train_docs_jsonl: str, terms2doc_json: str, sample_size: int = 28, seed: int = 123) -> None: + def fit( + self, + train_docs_jsonl: str, + terms2doc_json: str, + sample_size: int = 28, + seed: int = 123, + ) -> None: """ Fit phase: Builds and caches the few-shot prompt blocks from the training files. 
No model training occurs (Few-Shot/In-Context Learning). """ # Build prompt block for Term extraction - _ = self.build_stratified_fewshot_prompt(train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed) + _ = self.build_stratified_fewshot_prompt( + train_docs_jsonl, terms2doc_json, sample_size=sample_size, seed=seed + ) # Build prompt block for Type extraction - _ = self.build_types_fewshot_block(train_docs_jsonl, terms2doc_json, sample_per_term=1) + _ = self.build_types_fewshot_block( + train_docs_jsonl, terms2doc_json, sample_per_term=1 + ) # ------------------------- # Inference helpers (prompt construction and output parsing) @@ -376,10 +418,18 @@ def _parse_list_like(self, raw_string: str) -> List[str]: def _call_model_one(self, prompt: str, max_new_tokens: int = 120) -> str: """Calls the underlying LocalAutoLLM for a single prompt. Returns the raw tail output.""" # self.model is an instance of LocalAutoLLM - model_output = self.model.generate([prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0) + model_output = self.model.generate( + [prompt], max_new_tokens=max_new_tokens, temperature=0.0, top_p=1.0 + ) return model_output[0] if model_output else "" - def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + def predict_terms( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: """ Runs Term Extraction on the test documents and saves results to a JSONL file. Returns: The count of individual terms written. @@ -388,7 +438,10 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - raise RuntimeError("Few-shot block for terms is empty. Call fit() first.") num_written_terms = 0 - with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): for line_index, line in enumerate(file_in, start=1): if 0 < max_lines < line_index: break @@ -396,7 +449,7 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - try: document = json.loads(line.strip()) except Exception: - continue # Skip malformed JSON lines + continue # Skip malformed JSON lines doc_id = document.get("id", "unknown") title = document.get("title", "") @@ -410,7 +463,10 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - # Write extracted terms for term_or_type in predicted_terms: if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write(json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + "\n") + file_out.write( + json.dumps({"doc_id": doc_id, "term": term_or_type.strip()}) + + "\n" + ) num_written_terms += 1 # Lightweight memory management for long runs @@ -421,7 +477,13 @@ def predict_terms(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - return num_written_terms - def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = -1, max_new_tokens: int = 120) -> int: + def predict_types( + self, + docs_test_jsonl: str, + out_jsonl: str, + max_lines: int = -1, + max_new_tokens: int = 120, + ) -> int: """ Runs Type Extraction on the test documents and saves results to a JSONL file. Returns: The count of individual types written. 
@@ -430,7 +492,10 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - raise RuntimeError("Few-shot block for types is empty. Call fit() first.") num_written_types = 0 - with open(docs_test_jsonl, "r", encoding="utf-8") as file_in, open(out_jsonl, "w", encoding="utf-8") as file_out: + with ( + open(docs_test_jsonl, "r", encoding="utf-8") as file_in, + open(out_jsonl, "w", encoding="utf-8") as file_out, + ): for line_index, line in enumerate(file_in, start=1): if 0 < max_lines < line_index: break @@ -438,7 +503,7 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - try: document = json.loads(line.strip()) except Exception: - continue # Skip malformed JSON lines + continue # Skip malformed JSON lines doc_id = document.get("id", "unknown") title = document.get("title", "") @@ -452,7 +517,10 @@ def predict_types(self, docs_test_jsonl: str, out_jsonl: str, max_lines: int = - # Write extracted types for term_or_type in predicted_types: if isinstance(term_or_type, str) and term_or_type.strip(): - file_out.write(json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + "\n") + file_out.write( + json.dumps({"doc_id": doc_id, "type": term_or_type.strip()}) + + "\n" + ) num_written_types += 1 if line_index % 50 == 0: @@ -475,7 +543,9 @@ def load_gold_pairs(self, terms2doc_path: str) -> Set[Tuple[str, str]]: gold_pairs.add((doc_id, clean_term)) return gold_pairs - def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> Set[Tuple[str, str]]: + def load_predicted_pairs( + self, predicted_jsonl_path: str, key: str = "term" + ) -> Set[Tuple[str, str]]: """Load predicted (doc_id, term/type) pairs from a JSONL file, lowercased.""" predicted_pairs = set() with open(predicted_jsonl_path, "r", encoding="utf-8") as file_handle: @@ -490,7 +560,9 @@ def load_predicted_pairs(self, predicted_jsonl_path: str, key: str = "term") -> predicted_pairs.add((doc_id, value.strip().lower())) return predicted_pairs - def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: str = "term") -> float: + def evaluate_extraction_f1( + self, terms2doc_path: str, predicted_jsonl: str, key: str = "term" + ) -> float: """ Computes set-based binary Precision, Recall, and F1 score against the gold pairs. 
""" @@ -507,6 +579,7 @@ def evaluate_extraction_f1(self, terms2doc_path: str, predicted_jsonl: str, key: # Use scikit-learn for metric calculation from sklearn.metrics import precision_recall_fscore_support + precision, recall, f1, _ = precision_recall_fscore_support( y_true, y_pred, average="binary", zero_division=0 ) From ec2313528859e11ad28401de93a9797e4c353f2d Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Tue, 11 Nov 2025 10:00:08 +0100 Subject: [PATCH 6/7] removing changes from __init__.py files --- ontolearner/__init__.py | 24 +----------------------- ontolearner/learner/__init__.py | 9 --------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py index 5ebd3f6..a1b5d5a 100644 --- a/ontolearner/__init__.py +++ b/ontolearner/__init__.py @@ -29,18 +29,7 @@ AutoRetrieverLearner, AutoRAGLearner, StandardizedPrompting, - LabelMapper, - RWTHDBISTaxonomyLearner, - RWTHDBISTermTypingLearner, - SKHNLPZSLearner, - SKHNLPSequentialFTLearner, - SBUNLPFewShotLearner, - SBUNLPZSLearner, - SBUNLPText2OntoLearner, - AlexbekCrossAttnLearner, - AlexbekRFLearner, - AlexbekRAGLearner, - AlexbekFewShotLearner) + LabelMapper,) from ._learner import LearnerPipeline from .processor import Processor @@ -58,17 +47,6 @@ "LabelMapper", "LearnerPipeline", "Processor", - "RWTHDBISTaxonomyLearner", - "RWTHDBISTermTypingLearner", - "SKHNLPZSLearner", - "SKHNLPSequentialFTLearner", - "SBUNLPFewShotLearner", - "SBUNLPZSLearner", - "SBUNLPText2OntoLearner", - "AlexbekCrossAttnLearner", - "AlexbekRFLearner", - "AlexbekRAGLearner", - "AlexbekFewShotLearner", "data_structure", "text2onto", "ontology", diff --git a/ontolearner/learner/__init__.py b/ontolearner/learner/__init__.py index 71020e8..0baf580 100644 --- a/ontolearner/learner/__init__.py +++ b/ontolearner/learner/__init__.py @@ -17,12 +17,3 @@ from .rag import AutoRAGLearner from .prompt import StandardizedPrompting from .label_mapper import LabelMapper -from .taxonomy_discovery.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTaxonomyLearner -from .term_typing.rwthdbis import RWTHDBISSFTLearner as RWTHDBISTermTypingLearner -from .taxonomy_discovery.skhnlp import SKHNLPSequentialFTLearner, SKHNLPZSLearner -from .taxonomy_discovery.sbunlp import SBUNLPFewShotLearner -from .term_typing.sbunlp import SBUNLPZSLearner -from .text2onto import SBUNLPFewShotLearner as SBUNLPText2OntoLearner -from .taxonomy_discovery.alexbek import AlexbekCrossAttnLearner -from .term_typing.alexbek import AlexbekRFLearner, AlexbekRAGLearner -from .text2onto.alexbek import AlexbekFewShotLearner From 2d49d94e2a42c3afd49ff5ee0907be123fcc3dcc Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Tue, 11 Nov 2025 13:17:44 +0100 Subject: [PATCH 7/7] Changes removed from requirements.txt --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 28a92bb..3ce19f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,3 @@ sentence-transformers~=5.1.0 scikit-learn~=1.6.1 bitsandbytes~=0.45.1 mistral-common[sentencepiece]~=1.8.5 -g4f -protobuf<5 -accelerate>=0.26.0 -Levenshtein