Commit

allow anonymous uploads
mam10eks committed Dec 2, 2024
1 parent 7b4c908 commit f5f5525
Showing 8 changed files with 182 additions and 23 deletions.
8 changes: 5 additions & 3 deletions application/src/tira_app/endpoints/v1/_anonymous.py
@@ -3,6 +3,8 @@
from rest_framework.request import Request
from rest_framework.response import Response

from ... import model as modeldb


@api_view(["GET"])
def read_anonymous_submission(request: Request, submission_uuid: str) -> Response:
@@ -15,9 +17,9 @@ def read_anonymous_submission(request: Request, submission_uuid: str) -> Response
Returns:
Response: The information about the anonymous submission
"""
return Response(
{"uuid": submission_uuid, "dataset_id": "clueweb09-en-trec-web-2009-20230107-training", "created": "fooo"}
)
ret = modeldb.AnonymousUploads.objects.get(uuid=submission_uuid)

return Response({"uuid": ret.uuid, "dataset_id": ret.dataset.dataset_id, "created": ret.created})


endpoints = [
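The stub response (with its hard-coded dataset id and "fooo" timestamp) is replaced by a real database lookup. A minimal sketch of a client round-trip against this endpoint, assuming a reachable TIRA instance (base URL and UUID below are placeholders; the route is the one the frontend calls in ClaimSubmission.vue):

    import requests

    # Placeholders, not values from this commit:
    base_url = "https://tira-instance.example.org"
    submission_uuid = "00000000-0000-0000-0000-000000000000"

    # Route as called by the frontend: '/v1/anonymous/' + uuid
    resp = requests.get(f"{base_url}/v1/anonymous/{submission_uuid}")
    resp.raise_for_status()
    print(resp.json())  # e.g. {"uuid": ..., "dataset_id": ..., "created": ...}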
50 changes: 42 additions & 8 deletions application/src/tira_app/endpoints/vm_api.py
@@ -1,17 +1,22 @@
import json
import logging
import shutil
import tempfile
import uuid
import zipfile
from functools import wraps
from http import HTTPStatus
from pathlib import Path

from discourse_client_in_disraptor.discourse_api_client import get_disraptor_user
from django.conf import settings
from django.core.cache import cache
from django.db.utils import IntegrityError
from django.http import HttpResponseNotAllowed, JsonResponse
from django.http import HttpResponseNotAllowed, HttpResponseServerError, JsonResponse
from django.views.decorators.csrf import csrf_exempt
from grpc import RpcError, StatusCode
from markdown import markdown
from tira.check_format import _fmt, check_format

from .. import tira_model as model
from ..authentication import auth
@@ -495,7 +500,9 @@ def upload(request, task_id, vm_id, dataset_id, upload_id):
def anonymous_upload(request, dataset_id):
if request.method == "POST":
if not dataset_id or dataset_id is None or dataset_id == "None":
return JsonResponse({"status": 1, "message": "Please specify the associated dataset."})
return HttpResponseServerError(
json.dumps({"status": 1, "message": "Please specify the associated dataset."})
)

dataset = model.get_dataset(dataset_id)
if (
@@ -505,25 +512,52 @@ def anonymous_upload(request, dataset_id):
or "task" not in dataset
or not dataset["task"]
):
return JsonResponse({"status": 1, "message": f"Uploads are not allowed for the dataset {dataset_id}."})
return HttpResponseServerError(
json.dumps({"status": 1, "message": f"Uploads are not allowed for the dataset {dataset_id}."})
)

if dataset["is_deprecated"]:
return JsonResponse(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
return HttpResponseServerError(
json.dumps(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
)
)

task = model.get_task(dataset["task"], False)
if not task or not task["featured"]:
return JsonResponse(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
return HttpResponseServerError(
json.dumps(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
)
)

uploaded_file = request.FILES["file"]
upload_id = str(uuid.uuid4())

result_dir = tempfile.TemporaryDirectory(prefix="tira-upload", delete=False).name
result_dir = Path(result_dir)

with open(result_dir / "upload.zip", "wb+") as destination:
for chunk in uploaded_file.chunks():
destination.write(chunk)

with zipfile.ZipFile(result_dir / "upload.zip", "r") as zip_ref:
zip_ref.extractall(result_dir / "extracted")

status_code, message = check_format(result_dir / "extracted", dataset["format"][0])

if status_code != _fmt.OK:
return JsonResponse({"status": 1, "message": message}, status=500)
from .. import model as modeldb

(Path(settings.TIRA_ROOT) / "data" / "anonymous-uploads").mkdir(exist_ok=True, parents=True)
shutil.move(result_dir / "extracted", Path(settings.TIRA_ROOT) / "data" / "anonymous-uploads" / upload_id)
dataset = modeldb.Dataset.objects.get(dataset_id=dataset_id)
modeldb.AnonymousUploads.objects.create(uuid=upload_id, dataset=dataset)

return JsonResponse({"status": 0, "message": "ok", "uuid": upload_id})
else:
return JsonResponse({"status": 1, "message": "GET is not allowed here."})
return HttpResponseServerError(json.dumps({"status": 1, "message": "GET is not allowed here."}))


@check_permissions
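For reference, a hedged sketch of the client side of anonymous_upload: zip a run file in memory and POST it as multipart form data to the route the python-client uses (/api/v1/anonymous-uploads/&lt;dataset_id&gt;; host, dataset id, and the local run.txt are placeholders):

    import io
    import zipfile

    import requests

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write("run.txt", arcname="run.txt")  # local run file, placeholder
    buf.seek(0)

    resp = requests.post(
        "https://tira-instance.example.org/api/v1/anonymous-uploads/my-dataset-id",
        files={"file": ("upload.zip", buf, "application/zip")},  # read via request.FILES["file"]
        headers={"Accept": "application/json"},
    )
    print(resp.json())  # {"status": 0, "message": "ok", "uuid": ...} on success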
22 changes: 22 additions & 0 deletions application/src/tira_app/migrations/0007_anonymousuploads.py
@@ -0,0 +1,22 @@
# Generated by Django 5.0.9 on 2024-12-02 00:05

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("tira", "0006_dataset_ir_datasets_id_2"),
]

operations = [
migrations.CreateModel(
name="AnonymousUploads",
fields=[
("uuid", models.CharField(max_length=150, primary_key=True, serialize=False)),
("created", models.DateField(auto_now_add=True)),
("dataset", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="tira.dataset")),
],
),
]
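Assuming a standard Django project layout, the new table is created by applying this migration, e.g. python manage.py migrate tira 0007_anonymousuploads (the app label "tira" is taken from the migration's dependencies).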
6 changes: 6 additions & 0 deletions application/src/tira_app/model.py
@@ -214,6 +214,12 @@ class Upload(models.Model):
rename_to = models.TextField(default=None, null=True)


class AnonymousUploads(models.Model):
uuid = models.CharField(max_length=150, primary_key=True)
dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE)
created = models.DateField(auto_now_add=True)


class DockerSoftware(models.Model):
docker_software_id = models.AutoField(primary_key=True)
vm = models.ForeignKey(VirtualMachine, on_delete=models.CASCADE)
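A hedged sketch of how the new model ties the pieces of this commit together (anonymous_upload creates a row, read_anonymous_submission reads it back; the import path and dataset id are placeholders, the diff itself uses the relative "from .. import model as modeldb"):

    from tira_app import model as modeldb  # assumed absolute import path

    dataset = modeldb.Dataset.objects.get(dataset_id="my-dataset-id")
    upload = modeldb.AnonymousUploads.objects.create(
        uuid="00000000-0000-0000-0000-000000000000", dataset=dataset
    )

    # Read it back, as read_anonymous_submission does:
    ret = modeldb.AnonymousUploads.objects.get(uuid=upload.uuid)
    print(ret.dataset.dataset_id, ret.created)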
9 changes: 7 additions & 2 deletions frontend/src/ClaimSubmission.vue
@@ -16,7 +16,10 @@

<h3>Details</h3>
<div class="py-2"></div>
<v-skeleton-loader type="card" v-if="dataset === undefined || submissionToClaim === undefined"/>
<v-skeleton-loader type="card" v-if="(dataset === undefined || submissionToClaim === undefined) && !error"/>
<div v-if="error">
No submission with ownership UUID {{ uuid }} exists.
</div>
<div v-if="dataset !== undefined && submissionToClaim !== undefined">
<p>
The run was submitted on {{ submissionToClaim.created }} to the dataset <a :href="'/datasets?query=' + dataset.dataset_id">{{ dataset.display_name }}</a>
@@ -72,6 +75,7 @@ export default {
uuid: '' as string,
dataset: undefined as DatasetInfo | undefined,
submissionToClaim: undefined as ClaimSubmissionInfo | undefined,
error: false,
rest_url: inject("REST base URL"),
new_software: false,
}
@@ -80,6 +84,7 @@
loadData() {
this.dataset = undefined
this.submissionToClaim = undefined
this.error = false
this.$router.push({ path: '/claim-submission/' + this.uuid})
get(this.rest_url + '/v1/anonymous/' + this.uuid)
@@ -88,7 +93,7 @@
if (this.submissionToClaim && this.submissionToClaim.dataset_id) {
get(this.rest_url + '/v1/datasets/view/' + this.submissionToClaim.dataset_id).then((i) => this.dataset = i as DatasetInfo)
}
})
}).catch(() => { this.error = true })
},
},
73 changes: 66 additions & 7 deletions python-client/tira/rest_api_client.py
@@ -3,6 +3,7 @@
import json
import logging
import os
import tempfile
import time
import zipfile
from functools import lru_cache
@@ -15,6 +16,7 @@
import requests
from tqdm import tqdm

from tira.check_format import check_format
from tira.local_execution_integration import LocalExecutionIntegration
from tira.pandas_integration import PandasIntegration
from tira.profiling_integration import ProfilingIntegration
@@ -130,7 +132,7 @@ def get_dataset(self, dataset) -> dict:
dict: The TIRA representation of the dataset.
"""

dataset_identifier = dataset
dataset_identifier = self._TiraClient__extract_dataset_identifier(dataset)
datasets = self.archived_json_response("/v1/datasets/all")

ret = self._TiraClient__matching_dataset(datasets, dataset_identifier)
@@ -517,17 +519,29 @@ def download_dataset(self, task, dataset, truth_dataset=False):
if "/" in dataset:
dataset = dataset.split("/")[-1]

dataset = dataset_ir_redirects(dataset)
meta_data = self.get_dataset(f"{task}/{dataset}")
data_type = "training" if dataset.endswith("-training") else "test"
suffix = "inputs" if not truth_dataset else "truths"
url = None
if (
not meta_data
or "mirrors" not in meta_data
or suffix not in meta_data["mirrors"]
or not meta_data["mirrors"][suffix]
):
dataset = dataset_ir_redirects(dataset)
else:
url = list(meta_data["mirrors"][suffix].values())[0]

target_dir = f"{self.tira_cache_dir}/extracted_datasets/{task}/{dataset}/"
suffix = "input-data" if not truth_dataset else "truth-data"
if os.path.isdir(target_dir + suffix):
return target_dir + suffix
data_type = "training" if dataset.endswith("-training") else "test"
self.download_and_extract_zip(
f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip',
target_dir,
)

if not url:
url = f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip'

self.download_and_extract_zip(url, target_dir)

os.rename(target_dir + f"/{dataset}", target_dir + suffix)

@@ -743,6 +757,51 @@ def create_upload_group(self, task_id: str, vm_id: str, display_name: str) -> Op
logging.debug(f"Created new upload with id {ret['upload']}")
return ret["upload"]

def upload_run_anonymous(self, file_path: Path, dataset_id: str):
upload_to_tira = self.get_dataset(dataset_id)

if isinstance(file_path, str):
file_path = Path(file_path)

# TODO use format from upload_to_tira instead of hard-coded run.txt
check_format(file_path, "run.txt")

zip_file = tempfile.TemporaryDirectory(prefix="tira-upload", delete=False).name
zip_file = Path(zip_file)
zip_file.mkdir(parents=True, exist_ok=True)
zip_file = zip_file / "tira-upload.zip"

zf = zipfile.ZipFile(zip_file, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9)
for root, _, files in os.walk(file_path):
for name in files:
filePath = os.path.join(root, name)
zf.write(filePath, arcname=name)

zf.close()
headers = {"Accept": "application/json"}
files = {"file": open(zip_file, "rb")}

resp = requests.post(
url=f"{self.base_url}/api/v1/anonymous-uploads/{upload_to_tira['dataset_id']}",
files=files,
headers=headers,
verify=False,
)

if resp.status_code not in {200, 202}:
message = resp.content.decode()
try:
message = json.loads(message)
message = message["message"]
except Exception:
pass
message = f"Failed to upload to TIRA, got statuscode {resp.status_code}. Details: {message}"
print(message)
raise ValueError(message)

resp = resp.json()
print(f'Run uploaded to TIRA. Claim ownership via: {self.base_url}/claim-submission/{resp["uuid"]}')

def upload_run(
self,
file_path: Path,
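A hedged usage sketch for the new upload_run_anonymous (paths and ids are placeholders; the directory must contain a run.txt, since the format check is currently hard-coded to "run.txt" per the TODO above):

    from tira.rest_api_client import Client

    tira = Client()
    # Zips the directory, checks the run.txt format, and POSTs it anonymously:
    tira.upload_run_anonymous("path/to/run-dir", "my-dataset-id")
    # On success, a claim link of the form {base_url}/claim-submission/{uuid} is printed.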
22 changes: 19 additions & 3 deletions python-client/tira/third_party_integrations.py
@@ -127,7 +127,7 @@ def register_rerank_data_to_ir_datasets(path_to_rerank_file, ir_dataset_id, orig
register_dataset_from_re_rank_file(ir_dataset_id, default_input, original_ir_datasets_id)


def persist_and_normalize_run(run, system_name, default_output=None, output_file=None, depth=1000):
def persist_and_normalize_run(run, system_name, default_output=None, output_file=None, depth=1000, upload_to_tira=None):
if output_file is None and default_output is None:
print(
'I use the environment variable "TIRA_OUTPUT_DIR" to determine where I should store the run file using "."'
@@ -145,8 +145,21 @@ def persist_and_normalize_run(run, system_name, default_output=None, output_file

if not output_file.endswith("run.txt"):
output_file = output_file + "/run.txt"
if upload_to_tira and not in_tira_sandbox():
from tira.rest_api_client import Client as RestClient

tira = RestClient()
upload_to_tira = tira.get_dataset(upload_to_tira)
else:
upload_to_tira = None

if upload_to_tira and tira:
output_file = output_file + ".gz"
normalize_run(run, system_name, depth).to_csv(output_file, sep=" ", header=False, index=False)
print(f'Done. run file is stored under "{output_file}".')
if upload_to_tira and tira:
output_file = Path(output_file).parent
tira.upload_run_anonymous(output_file, upload_to_tira["dataset_id"])


def normalize_run(run, system_name, depth=1000):
@@ -329,14 +342,17 @@ def extract_previous_stages_from_docker_image(image: str, command: str = None):
return extract_previous_stages_from_notebook(Path(local_file))


def in_tira_sandbox():
return "TIRA_INPUT_DATASET" in os.environ


def load_ir_datasets():
try:
from ir_datasets.datasets.base import Dataset # noqa: F401
except Exception:
return None

# Detect if we are in the TIRA sandbox
if "TIRA_INPUT_DATASET" in os.environ:
if in_tira_sandbox():
from tira.ir_datasets_util import static_ir_dataset

if os.path.isfile(os.path.join(os.environ["TIRA_INPUT_DATASET"], "rerank.jsonl.gz")) or os.path.isfile(
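A hedged end-to-end sketch of the new upload_to_tira parameter (the column names in the toy run and the dataset id are assumptions; outside the TIRA sandbox this writes the gzipped run file and then triggers the anonymous upload):

    import pandas as pd

    from tira.third_party_integrations import persist_and_normalize_run

    run = pd.DataFrame([
        {"qid": "1", "docno": "doc-1", "score": 2.5},
        {"qid": "1", "docno": "doc-2", "score": 1.5},
    ])

    persist_and_normalize_run(
        run,
        system_name="my-system",
        default_output="./output",       # where the run file is written (placeholder)
        upload_to_tira="my-dataset-id",  # placeholder; enables the anonymous upload path
    )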
15 changes: 15 additions & 0 deletions python-client/tira/tira_client.py
@@ -96,6 +96,21 @@ def submit_run(self, task_id: str, vm_id: str, dataset_id: str, upload_id: str,
"""
pass

def __extract_dataset_identifier(self, dataset: any):
"""Extract the dataset identifier from a passed object.
Args:
dataset (any): Some representation of the dataset.
Returns:
any: The dataset identifier if one can be extracted, otherwise the passed object.
"""
if hasattr(dataset, "irds_ref"):
return self.__extract_dataset_identifier(dataset.irds_ref())
if hasattr(dataset, "dataset_id"):
return dataset.dataset_id()
return dataset

def __matching_dataset(self, datasets, dataset_identifier) -> "Optional[dict]":
"""Find the dataset identified by the passed dataset_identifier in all passed datasets.
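A hedged sketch of what __extract_dataset_identifier enables: get_dataset can now unwrap objects exposing irds_ref() or dataset_id() instead of requiring a plain string id (the class below is an illustrative stand-in, not part of the commit):

    from tira.rest_api_client import Client

    class FakeIrdsDataset:
        def dataset_id(self):
            return "my-dataset-id"  # placeholder

    tira = Client()
    ds = tira.get_dataset(FakeIrdsDataset())  # resolved via dataset_id()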
