Commit

allow anonymous uploads
mam10eks committed Dec 2, 2024
1 parent 7b4c908 commit f5f5525
Showing 8 changed files with 182 additions and 23 deletions.
8 changes: 5 additions & 3 deletions application/src/tira_app/endpoints/v1/_anonymous.py
@@ -3,6 +3,8 @@
from rest_framework.request import Request
from rest_framework.response import Response

from ... import model as modeldb


@api_view(["GET"])
def read_anonymous_submission(request: Request, submission_uuid: str) -> Response:
@@ -15,9 +17,9 @@ def read_anonymous_submission(request: Request, submission_uuid: str) -> Response
Returns:
Response: The information about the anonymous submission
"""
return Response(
{"uuid": submission_uuid, "dataset_id": "clueweb09-en-trec-web-2009-20230107-training", "created": "fooo"}
)
ret = modeldb.AnonymousUploads.objects.get(uuid=submission_uuid)

return Response({"uuid": ret.uuid, "dataset_id": ret.dataset.dataset_id, "created": ret.created})


endpoints = [
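The stub response (with its hard-coded dataset id and "fooo" timestamp) is replaced by a real database lookup. A minimal sketch of a client round-trip against this endpoint, assuming a reachable TIRA instance (base URL and UUID below are placeholders; the route is the one the frontend calls in ClaimSubmission.vue):

    import requests

    # Placeholders, not values from this commit:
    base_url = "https://tira-instance.example.org"
    submission_uuid = "00000000-0000-0000-0000-000000000000"

    # Route as called by the frontend: '/v1/anonymous/' + uuid
    resp = requests.get(f"{base_url}/v1/anonymous/{submission_uuid}")
    resp.raise_for_status()
    print(resp.json())  # e.g. {"uuid": ..., "dataset_id": ..., "created": ...}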
50 changes: 42 additions & 8 deletions application/src/tira_app/endpoints/vm_api.py
@@ -1,17 +1,22 @@
import json
import logging
import shutil
import tempfile
import uuid
import zipfile
from functools import wraps
from http import HTTPStatus
from pathlib import Path

from discourse_client_in_disraptor.discourse_api_client import get_disraptor_user
from django.conf import settings
from django.core.cache import cache
from django.db.utils import IntegrityError
from django.http import HttpResponseNotAllowed, JsonResponse
from django.http import HttpResponseNotAllowed, HttpResponseServerError, JsonResponse
from django.views.decorators.csrf import csrf_exempt
from grpc import RpcError, StatusCode
from markdown import markdown
from tira.check_format import _fmt, check_format

from .. import tira_model as model
from ..authentication import auth
@@ -495,7 +500,9 @@ def upload(request, task_id, vm_id, dataset_id, upload_id):
def anonymous_upload(request, dataset_id):
if request.method == "POST":
if not dataset_id or dataset_id is None or dataset_id == "None":
return JsonResponse({"status": 1, "message": "Please specify the associated dataset."})
return HttpResponseServerError(
json.dumps({"status": 1, "message": "Please specify the associated dataset."})
)

dataset = model.get_dataset(dataset_id)
if (
@@ -505,25 +512,52 @@ def anonymous_upload(request, dataset_id):
or "task" not in dataset
or not dataset["task"]
):
return JsonResponse({"status": 1, "message": f"Uploads are not allowed for the dataset {dataset_id}."})
return HttpResponseServerError(
json.dumps({"status": 1, "message": f"Uploads are not allowed for the dataset {dataset_id}."})
)

if dataset["is_deprecated"]:
return JsonResponse(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
return HttpResponseServerError(
json.dumps(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
)
)

task = model.get_task(dataset["task"], False)
if not task or not task["featured"]:
return JsonResponse(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
return HttpResponseServerError(
json.dumps(
{"status": 1, "message": f"The dataset {dataset_id} is deprecated and therefore allows no uploads."}
)
)

uploaded_file = request.FILES["file"]
upload_id = str(uuid.uuid4())

result_dir = tempfile.TemporaryDirectory(prefix="tira-upload", delete=False).name
result_dir = Path(result_dir)

with open(result_dir / "upload.zip", "wb+") as destination:
for chunk in uploaded_file.chunks():
destination.write(chunk)

with zipfile.ZipFile(result_dir / "upload.zip", "r") as zip_ref:
zip_ref.extractall(result_dir / "extracted")

status_code, message = check_format(result_dir / "extracted", dataset["format"][0])

if status_code != _fmt.OK:
return JsonResponse({"status": 1, "message": message}, status=500)
from .. import model as modeldb

(Path(settings.TIRA_ROOT) / "data" / "anonymous-uploads").mkdir(exist_ok=True, parents=True)
shutil.move(result_dir / "extracted", Path(settings.TIRA_ROOT) / "data" / "anonymous-uploads" / upload_id)
dataset = modeldb.Dataset.objects.get(dataset_id=dataset_id)
modeldb.AnonymousUploads.objects.create(uuid=upload_id, dataset=dataset)

return JsonResponse({"status": 0, "message": "ok", "uuid": upload_id})
else:
return JsonResponse({"status": 1, "message": "GET is not allowed here."})
return HttpResponseServerError(json.dumps({"status": 1, "message": "GET is not allowed here."}))


@check_permissions
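For reference, a hedged sketch of the client side of anonymous_upload: zip a run file in memory and POST it as multipart form data to the route the python-client uses (/api/v1/anonymous-uploads/&lt;dataset_id&gt;; host, dataset id, and the local run.txt are placeholders):

    import io
    import zipfile

    import requests

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write("run.txt", arcname="run.txt")  # local run file, placeholder
    buf.seek(0)

    resp = requests.post(
        "https://tira-instance.example.org/api/v1/anonymous-uploads/my-dataset-id",
        files={"file": ("upload.zip", buf, "application/zip")},  # read via request.FILES["file"]
        headers={"Accept": "application/json"},
    )
    print(resp.json())  # {"status": 0, "message": "ok", "uuid": ...} on success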
22 changes: 22 additions & 0 deletions application/src/tira_app/migrations/0007_anonymousuploads.py
@@ -0,0 +1,22 @@
# Generated by Django 5.0.9 on 2024-12-02 00:05

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("tira", "0006_dataset_ir_datasets_id_2"),
]

operations = [
migrations.CreateModel(
name="AnonymousUploads",
fields=[
("uuid", models.CharField(max_length=150, primary_key=True, serialize=False)),
("created", models.DateField(auto_now_add=True)),
("dataset", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="tira.dataset")),
],
),
]
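Assuming a standard Django project layout, the new table is created by applying this migration, e.g. python manage.py migrate tira 0007_anonymousuploads (the app label "tira" is taken from the migration's dependencies).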
6 changes: 6 additions & 0 deletions application/src/tira_app/model.py
@@ -214,6 +214,12 @@ class Upload(models.Model):
rename_to = models.TextField(default=None, null=True)


class AnonymousUploads(models.Model):
uuid = models.CharField(max_length=150, primary_key=True)
dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE)
created = models.DateField(auto_now_add=True)


class DockerSoftware(models.Model):
docker_software_id = models.AutoField(primary_key=True)
vm = models.ForeignKey(VirtualMachine, on_delete=models.CASCADE)
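A hedged sketch of how the new model ties the pieces of this commit together (anonymous_upload creates a row, read_anonymous_submission reads it back; the import path and dataset id are placeholders, the diff itself uses the relative "from .. import model as modeldb"):

    from tira_app import model as modeldb  # assumed absolute import path

    dataset = modeldb.Dataset.objects.get(dataset_id="my-dataset-id")
    upload = modeldb.AnonymousUploads.objects.create(
        uuid="00000000-0000-0000-0000-000000000000", dataset=dataset
    )

    # Read it back, as read_anonymous_submission does:
    ret = modeldb.AnonymousUploads.objects.get(uuid=upload.uuid)
    print(ret.dataset.dataset_id, ret.created)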
9 changes: 7 additions & 2 deletions frontend/src/ClaimSubmission.vue
@@ -16,7 +16,10 @@

<h3>Details</h3>
<div class="py-2"></div>
<v-skeleton-loader type="card" v-if="dataset === undefined || submissionToClaim === undefined"/>
<v-skeleton-loader type="card" v-if="(dataset === undefined || submissionToClaim === undefined) && !error"/>
<div v-if="error">
No submission with ownership UUID {{ uuid }} exists.
</div>
<div v-if="dataset !== undefined && submissionToClaim !== undefined">
<p>
The run was submitted on {{ submissionToClaim.created }} to the dataset <a :href="'/datasets?query=' + dataset.dataset_id">{{ dataset.display_name }}</a>
@@ -72,6 +75,7 @@ export default {
uuid: '' as string,
dataset: undefined as DatasetInfo | undefined,
submissionToClaim: undefined as ClaimSubmissionInfo | undefined,
error: false,
rest_url: inject("REST base URL"),
new_software: false,
}
@@ -80,6 +84,7 @@
loadData() {
this.dataset = undefined
this.submissionToClaim = undefined
this.error = false
this.$router.push({ path: '/claim-submission/' + this.uuid})
get(this.rest_url + '/v1/anonymous/' + this.uuid)
@@ -88,7 +93,7 @@
if (this.submissionToClaim && this.submissionToClaim.dataset_id) {
get(this.rest_url + '/v1/datasets/view/' + this.submissionToClaim.dataset_id).then((i) => this.dataset = i as DatasetInfo)
}
})
}).catch(() => { this.error = true })
},
},
73 changes: 66 additions & 7 deletions python-client/tira/rest_api_client.py
@@ -3,6 +3,7 @@
import json
import logging
import os
import tempfile
import time
import zipfile
from functools import lru_cache
@@ -15,6 +16,7 @@
import requests
from tqdm import tqdm

from tira.check_format import check_format
from tira.local_execution_integration import LocalExecutionIntegration
from tira.pandas_integration import PandasIntegration
from tira.profiling_integration import ProfilingIntegration
@@ -130,7 +132,7 @@ def get_dataset(self, dataset) -> dict:
dict: The TIRA representation of the dataset.
"""

dataset_identifier = dataset
dataset_identifier = self._TiraClient__extract_dataset_identifier(dataset)
datasets = self.archived_json_response("/v1/datasets/all")

ret = self._TiraClient__matching_dataset(datasets, dataset_identifier)
@@ -517,17 +519,29 @@ def download_dataset(self, task, dataset, truth_dataset=False):
if "/" in dataset:
dataset = dataset.split("/")[-1]

dataset = dataset_ir_redirects(dataset)
meta_data = self.get_dataset(f"{task}/{dataset}")
data_type = "training" if dataset.endswith("-training") else "test"
suffix = "inputs" if not truth_dataset else "truths"
url = None
if (
not meta_data
or "mirrors" not in meta_data
or suffix not in meta_data["mirrors"]
or not meta_data["mirrors"][suffix]
):
dataset = dataset_ir_redirects(dataset)
else:
url = list(meta_data["mirrors"][suffix].values())[0]

target_dir = f"{self.tira_cache_dir}/extracted_datasets/{task}/{dataset}/"
suffix = "input-data" if not truth_dataset else "truth-data"
if os.path.isdir(target_dir + suffix):
return target_dir + suffix
data_type = "training" if dataset.endswith("-training") else "test"
self.download_and_extract_zip(
f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip',
target_dir,
)

if not url:
url = f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip'

self.download_and_extract_zip(url, target_dir)

os.rename(target_dir + f"/{dataset}", target_dir + suffix)

@@ -743,6 +757,51 @@ def create_upload_group(self, task_id: str, vm_id: str, display_name: str) -> Op
logging.debug(f"Created new upload with id {ret['upload']}")
return ret["upload"]

def upload_run_anonymous(self, file_path: Path, dataset_id: str):
upload_to_tira = self.get_dataset(dataset_id)

if isinstance(file_path, str):
file_path = Path(file_path)

# TODO use format from upload_to_tira instead of hard-coded run.txt
check_format(file_path, "run.txt")

zip_file = tempfile.TemporaryDirectory(prefix="tira-upload", delete=False).name
zip_file = Path(zip_file)
zip_file.mkdir(parents=True, exist_ok=True)
zip_file = zip_file / "tira-upload.zip"

zf = zipfile.ZipFile(zip_file, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9)
for root, _, files in os.walk(file_path):
for name in files:
filePath = os.path.join(root, name)
zf.write(filePath, arcname=name)

zf.close()
headers = {"Accept": "application/json"}
files = {"file": open(zip_file, "rb")}

resp = requests.post(
url=f"{self.base_url}/api/v1/anonymous-uploads/{upload_to_tira['dataset_id']}",
files=files,
headers=headers,
verify=False,
)

if resp.status_code not in {200, 202}:
message = resp.content.decode()
try:
message = json.loads(message)
message = message["message"]
except Exception:
pass
message = f"Failed to upload to TIRA, got statuscode {resp.status_code}. Details: {message}"
print(message)
raise ValueError(message)

resp = resp.json()
print(f'Run uploaded to TIRA. Claim ownership via: {self.base_url}/claim-submission/{resp["uuid"]}')

def upload_run(
self,
file_path: Path,
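A hedged usage sketch for the new upload_run_anonymous (paths and ids are placeholders; the directory must contain a run.txt, since the format check is currently hard-coded to "run.txt" per the TODO above):

    from tira.rest_api_client import Client

    tira = Client()
    # Zips the directory, checks the run.txt format, and POSTs it anonymously:
    tira.upload_run_anonymous("path/to/run-dir", "my-dataset-id")
    # On success, a claim link of the form {base_url}/claim-submission/{uuid} is printed.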
22 changes: 19 additions & 3 deletions python-client/tira/third_party_integrations.py
@@ -127,7 +127,7 @@ def register_rerank_data_to_ir_datasets(path_to_rerank_file, ir_dataset_id, orig
register_dataset_from_re_rank_file(ir_dataset_id, default_input, original_ir_datasets_id)


def persist_and_normalize_run(run, system_name, default_output=None, output_file=None, depth=1000):
def persist_and_normalize_run(run, system_name, default_output=None, output_file=None, depth=1000, upload_to_tira=None):
if output_file is None and default_output is None:
print(
'I use the environment variable "TIRA_OUTPUT_DIR" to determine where I should store the run file using "."'
@@ -145,8 +145,21 @@ def persist_and_normalize_run(run, system_name, default_output=None, output_file

if not output_file.endswith("run.txt"):
output_file = output_file + "/run.txt"
if upload_to_tira and not in_tira_sandbox():
from tira.rest_api_client import Client as RestClient

tira = RestClient()
upload_to_tira = tira.get_dataset(upload_to_tira)
else:
upload_to_tira = None

if upload_to_tira and tira:
output_file = output_file + ".gz"
normalize_run(run, system_name, depth).to_csv(output_file, sep=" ", header=False, index=False)
print(f'Done. run file is stored under "{output_file}".')
if upload_to_tira and tira:
output_file = Path(output_file).parent
tira.upload_run_anonymous(output_file, upload_to_tira["dataset_id"])


def normalize_run(run, system_name, depth=1000):
@@ -329,14 +342,17 @@ def extract_previous_stages_from_docker_image(image: str, command: str = None):
return extract_previous_stages_from_notebook(Path(local_file))


def in_tira_sandbox():
return "TIRA_INPUT_DATASET" in os.environ


def load_ir_datasets():
try:
from ir_datasets.datasets.base import Dataset # noqa: F401
except Exception:
return None

# Detect if we are in the TIRA sandbox
if "TIRA_INPUT_DATASET" in os.environ:
if in_tira_sandbox():
from tira.ir_datasets_util import static_ir_dataset

if os.path.isfile(os.path.join(os.environ["TIRA_INPUT_DATASET"], "rerank.jsonl.gz")) or os.path.isfile(
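A hedged end-to-end sketch of the new upload_to_tira parameter (the column names in the toy run and the dataset id are assumptions; outside the TIRA sandbox this writes the gzipped run file and then triggers the anonymous upload):

    import pandas as pd

    from tira.third_party_integrations import persist_and_normalize_run

    run = pd.DataFrame([
        {"qid": "1", "docno": "doc-1", "score": 2.5},
        {"qid": "1", "docno": "doc-2", "score": 1.5},
    ])

    persist_and_normalize_run(
        run,
        system_name="my-system",
        default_output="./output",       # where the run file is written (placeholder)
        upload_to_tira="my-dataset-id",  # placeholder; enables the anonymous upload path
    )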
15 changes: 15 additions & 0 deletions python-client/tira/tira_client.py
@@ -96,6 +96,21 @@ def submit_run(self, task_id: str, vm_id: str, dataset_id: str, upload_id: str,
"""
pass

def __extract_dataset_identifier(self, dataset: any):
"""Extract the dataset identifier from a passed object.
Args:
dataset (any): Some representation of the dataset.
Returns:
any: The dataset identifier if one can be extracted, otherwise the passed object.
"""
if hasattr(dataset, "irds_ref"):
return self.__extract_dataset_identifier(dataset.irds_ref())
if hasattr(dataset, "dataset_id"):
return dataset.dataset_id()
return dataset

def __matching_dataset(self, datasets, dataset_identifier) -> "Optional[dict]":
"""Find the dataset identified by the passed dataset_identifier in all passed datasets.
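A hedged sketch of what __extract_dataset_identifier enables: get_dataset can now unwrap objects exposing irds_ref() or dataset_id() instead of requiring a plain string id (the class below is an illustrative stand-in, not part of the commit):

    from tira.rest_api_client import Client

    class FakeIrdsDataset:
        def dataset_id(self):
            return "my-dataset-id"  # placeholder

    tira = Client()
    ds = tira.get_dataset(FakeIrdsDataset())  # resolved via dataset_id()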
