Merge pull request #245 from mlcommons/add-http-basic-auth

Add support for basic auth
mlcommons · Oct 18, 2023 · 4204c61 · 4204c61
2 parents c703b62 + 08762a6
commit 4204c61
Showing 4 changed files with 704 additions and 1 deletion.
diff --git a/datasets/world-happiness/metadata.json b/datasets/world-happiness/metadata.json
@@ -0,0 +1,673 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "column": "ml:column",
+    "extract": "ml:extract",
+    "field": "ml:field",
+    "fileProperty": "ml:fileProperty",
+    "format": "ml:format",
+    "includes": "ml:includes",
+    "isEnumeration": "ml:isEnumeration",
+    "jsonPath": "ml:jsonPath",
+    "ml": "http://mlcommons.org/schema/",
+    "parentField": "ml:parentField",
+    "path": "ml:path",
+    "recordSet": "ml:recordSet",
+    "references": "ml:references",
+    "regex": "ml:regex",
+    "repeated": "ml:repeated",
+    "replace": "ml:replace",
+    "sc": "https://schema.org/",
+    "separator": "ml:separator",
+    "source": "ml:source",
+    "subField": "ml:subField",
+    "transform": "ml:transform",
+    "data": { "@id": "ml:data", "@type": "@json" },
+    "dataType": { "@id": "ml:dataType", "@type": "@vocab" }
+  },
+  "citation": "None",
+  "license": "https://creativecommons.org/publicdomain/zero/1.0/",
+  "url": "https://www.kaggle.com/datasets/unsdsn/world-happiness",
+  "distribution": [
+    {
+      "contentUrl": "https://www.kaggle.com/api/v1/datasets/download/unsdsn/world-happiness",
+      "contentSize": "36.809 KB",
+      "md5": "bc1\u002B\u002BXNoIWW685QyDLewsg==",
+      "encodingFormat": "application/zip",
+      "@type": "sc:FileObject",
+      "name": "archive.zip",
+      "description": "Archive containing all the contents of the World Happiness Report dataset"
+    },
+    {
+      "contentUrl": "2015.csv",
+      "containedIn": "archive.zip",
+      "encodingFormat": "text/csv",
+      "@type": "sc:FileObject",
+      "name": "2015.csv",
+      "description": "Happiness rank and scores by country, 2015"
+    },
+    {
+      "contentUrl": "2016.csv",
+      "containedIn": "archive.zip",
+      "encodingFormat": "text/csv",
+      "@type": "sc:FileObject",
+      "name": "2016.csv",
+      "description": "Happiness rank and scores by country, 2016"
+    },
+    {
+      "contentUrl": "2017.csv",
+      "containedIn": "archive.zip",
+      "encodingFormat": "text/csv",
+      "@type": "sc:FileObject",
+      "name": "2017.csv",
+      "description": "Happiness rank and scores by country, 2017"
+    },
+    {
+      "contentUrl": "2018.csv",
+      "containedIn": "archive.zip",
+      "encodingFormat": "text/csv",
+      "@type": "sc:FileObject",
+      "name": "2018.csv",
+      "description": "Happiness rank and scores by country, 2018"
+    },
+    {
+      "contentUrl": "2019.csv",
+      "containedIn": "archive.zip",
+      "encodingFormat": "text/csv",
+      "@type": "sc:FileObject",
+      "name": "2019.csv",
+      "description": "Happiness rank and scores by country, 2019"
+    }
+  ],
+  "recordSet": [
+    {
+      "field": [
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Country" }
+          },
+          "@type": "ml:Field",
+          "name": "Country",
+          "description": "Name of the country."
+        },
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Region" }
+          },
+          "@type": "ml:Field",
+          "name": "Region",
+          "description": "Region the country belongs to."
+        },
+        {
+          "dataType": "sc:Integer",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Happiness Rank" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness-Rank",
+          "description": "Rank of the country based on the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Happiness Score" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness-Score",
+          "description": "A metric measured in 2015 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest.\u0022"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Standard Error" }
+          },
+          "@type": "ml:Field",
+          "name": "Standard-Error",
+          "description": "The standard error of the happiness score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Economy (GDP per Capita)" }
+          },
+          "@type": "ml:Field",
+          "name": "Economy--GDP-per-Capita-",
+          "description": "The extent to which GDP contributes to the calculation of the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Family" }
+          },
+          "@type": "ml:Field",
+          "name": "Family",
+          "description": "The extent to which Family contributes to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Health (Life Expectancy)" }
+          },
+          "@type": "ml:Field",
+          "name": "Health--Life-Expectancy-",
+          "description": "The extent to which Life expectancy contributed to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Freedom" }
+          },
+          "@type": "ml:Field",
+          "name": "Freedom",
+          "description": "The extent to which Freedom contributed to the calculation of the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Trust (Government Corruption)" }
+          },
+          "@type": "ml:Field",
+          "name": "Trust--Government-Corruption-",
+          "description": "The extent to which Perception of Corruption contributes to Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Generosity" }
+          },
+          "@type": "ml:Field",
+          "name": "Generosity",
+          "description": "The extent to which Generosity contributed to the calculation of the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2015.csv",
+            "extract": { "column": "Dystopia Residual" }
+          },
+          "@type": "ml:Field",
+          "name": "Dystopia-Residual",
+          "description": "The extent to which Dystopia Residual contributed to the calculation of the Happiness Score."
+        }
+      ],
+      "@type": "ml:RecordSet",
+      "name": "2015.csv_records",
+      "description": "Happiness rank and scores by country, 2015"
+    },
+    {
+      "field": [
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Country" }
+          },
+          "@type": "ml:Field",
+          "name": "Country",
+          "description": "Name of the country"
+        },
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Region" }
+          },
+          "@type": "ml:Field",
+          "name": "Region",
+          "description": "Region the country belongs to"
+        },
+        {
+          "dataType": "sc:Integer",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Happiness Rank" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness-Rank",
+          "description": "Rank of the country based on the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Happiness Score" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness-Score",
+          "description": "A metric measured in 2016 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest\u0022"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Lower Confidence Interval" }
+          },
+          "@type": "ml:Field",
+          "name": "Lower-Confidence-Interval",
+          "description": "Lower Confidence Interval of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Upper Confidence Interval" }
+          },
+          "@type": "ml:Field",
+          "name": "Upper-Confidence-Interval",
+          "description": "Upper Confidence Interval of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Economy (GDP per Capita)" }
+          },
+          "@type": "ml:Field",
+          "name": "Economy--GDP-per-Capita-",
+          "description": "The extent to which GDP contributes to the calculation of the Happiness Score."
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Family" }
+          },
+          "@type": "ml:Field",
+          "name": "Family",
+          "description": "The extent to which Family contributes to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Health (Life Expectancy)" }
+          },
+          "@type": "ml:Field",
+          "name": "Health--Life-Expectancy-",
+          "description": "The extent to which Life expectancy contributed to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Freedom" }
+          },
+          "@type": "ml:Field",
+          "name": "Freedom",
+          "description": "The extent to which Freedom contributed to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Trust (Government Corruption)" }
+          },
+          "@type": "ml:Field",
+          "name": "Trust--Government-Corruption-",
+          "description": "The extent to which Perception of Corruption contributes to Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Generosity" }
+          },
+          "@type": "ml:Field",
+          "name": "Generosity",
+          "description": "The extent to which Generosity contributed to the calculation of the Happiness Score"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2016.csv",
+            "extract": { "column": "Dystopia Residual" }
+          },
+          "@type": "ml:Field",
+          "name": "Dystopia-Residual",
+          "description": "The extent to which Dystopia Residual contributed to the calculation of the Happiness Score."
+        }
+      ],
+      "@type": "ml:RecordSet",
+      "name": "2016.csv_records",
+      "description": "Happiness rank and scores by country, 2016"
+    },
+    {
+      "field": [
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Country" }
+          },
+          "@type": "ml:Field",
+          "name": "Country",
+          "description": "Country column"
+        },
+        {
+          "dataType": "sc:Integer",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Happiness.Rank" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness.Rank",
+          "description": "Happiness.Rank column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Happiness.Score" }
+          },
+          "@type": "ml:Field",
+          "name": "Happiness.Score",
+          "description": "A metric measured in 2017 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest\u0022"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Whisker.high" }
+          },
+          "@type": "ml:Field",
+          "name": "Whisker.high",
+          "description": "Whisker.high column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Whisker.low" }
+          },
+          "@type": "ml:Field",
+          "name": "Whisker.low",
+          "description": "Whisker.low column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Economy..GDP.per.Capita." }
+          },
+          "@type": "ml:Field",
+          "name": "Economy..GDP.per.Capita.",
+          "description": "Economy..GDP.per.Capita. column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Family" }
+          },
+          "@type": "ml:Field",
+          "name": "Family",
+          "description": "Family column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Health..Life.Expectancy." }
+          },
+          "@type": "ml:Field",
+          "name": "Health..Life.Expectancy.",
+          "description": "Health..Life.Expectancy. column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Freedom" }
+          },
+          "@type": "ml:Field",
+          "name": "Freedom",
+          "description": "Freedom column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Generosity" }
+          },
+          "@type": "ml:Field",
+          "name": "Generosity",
+          "description": "Generosity column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Trust..Government.Corruption." }
+          },
+          "@type": "ml:Field",
+          "name": "Trust..Government.Corruption.",
+          "description": "Trust..Government.Corruption. column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2017.csv",
+            "extract": { "column": "Dystopia.Residual" }
+          },
+          "@type": "ml:Field",
+          "name": "Dystopia.Residual",
+          "description": "Dystopia.Residual column"
+        }
+      ],
+      "@type": "ml:RecordSet",
+      "name": "2017.csv_records",
+      "description": "Happiness rank and scores by country, 2017"
+    },
+    {
+      "field": [
+        {
+          "dataType": "sc:Integer",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Overall rank" }
+          },
+          "@type": "ml:Field",
+          "name": "Overall-rank",
+          "description": "Overall rank column"
+        },
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Country or region" }
+          },
+          "@type": "ml:Field",
+          "name": "Country-or-region",
+          "description": "Country or region column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Score" }
+          },
+          "@type": "ml:Field",
+          "name": "Score",
+          "description": "Score column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "GDP per capita" }
+          },
+          "@type": "ml:Field",
+          "name": "GDP-per-capita",
+          "description": "GDP per capita column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Social support" }
+          },
+          "@type": "ml:Field",
+          "name": "Social-support",
+          "description": "Social support column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Healthy life expectancy" }
+          },
+          "@type": "ml:Field",
+          "name": "Healthy-life-expectancy",
+          "description": "Healthy life expectancy column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Freedom to make life choices" }
+          },
+          "@type": "ml:Field",
+          "name": "Freedom-to-make-life-choices",
+          "description": "Freedom to make life choices column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Generosity" }
+          },
+          "@type": "ml:Field",
+          "name": "Generosity",
+          "description": "Generosity column"
+        },
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2018.csv",
+            "extract": { "column": "Perceptions of corruption" }
+          },
+          "@type": "ml:Field",
+          "name": "Perceptions-of-corruption",
+          "description": "Perceptions of corruption column"
+        }
+      ],
+      "@type": "ml:RecordSet",
+      "name": "2018.csv_records",
+      "description": "Happiness rank and scores by country, 2018"
+    },
+    {
+      "field": [
+        {
+          "dataType": "sc:Integer",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Overall rank" }
+          },
+          "@type": "ml:Field",
+          "name": "Overall-rank",
+          "description": "Overall rank column"
+        },
+        {
+          "dataType": "sc:Text",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Country or region" }
+          },
+          "@type": "ml:Field",
+          "name": "Country-or-region",
+          "description": "Country or region column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Score" }
+          },
+          "@type": "ml:Field",
+          "name": "Score",
+          "description": "Score column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "GDP per capita" }
+          },
+          "@type": "ml:Field",
+          "name": "GDP-per-capita",
+          "description": "GDP per capita column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Social support" }
+          },
+          "@type": "ml:Field",
+          "name": "Social-support",
+          "description": "Social support column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Healthy life expectancy" }
+          },
+          "@type": "ml:Field",
+          "name": "Healthy-life-expectancy",
+          "description": "Healthy life expectancy column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Freedom to make life choices" }
+          },
+          "@type": "ml:Field",
+          "name": "Freedom-to-make-life-choices",
+          "description": "Freedom to make life choices column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Generosity" }
+          },
+          "@type": "ml:Field",
+          "name": "Generosity",
+          "description": "Generosity column"
+        },
+        {
+          "dataType": "sc:Float",
+          "source": {
+            "distribution": "2019.csv",
+            "extract": { "column": "Perceptions of corruption" }
+          },
+          "@type": "ml:Field",
+          "name": "Perceptions-of-corruption",
+          "description": "Perceptions of corruption column"
+        }
+      ],
+      "@type": "ml:RecordSet",
+      "name": "2019.csv_records",
+      "description": "Happiness rank and scores by country, 2019"
+    }
+  ],
+  "@type": "sc:Dataset",
+  "name": "World-Happiness-Report",
+  "description": "Happiness scored according to economic production, social support, etc."
+}
diff --git a/python/mlcroissant/README.md b/python/mlcroissant/README.md
@@ -40,6 +40,14 @@ python scripts/load.py \
     --num_records 10
 ```
 
+### Loading a `distribution` via `git+https`
+
+If the `encodingFormat` of a `distribution` is `git+https`, please provide the username and password by setting the `CROISSANT_GIT_USERNAME` and `CROISSANT_GIT_PASSWORD` environment variables. These will be used to construct the authentication necessary to load the distribution.
+
+### Loading a `distribution` via HTTP with Basic Auth
+
+If the `contentUrl` of a `distribution` requires authentication via Basic Auth, please provide the username and password by setting the `CROISSANT_BASIC_AUTH_USERNAME` and `CROISSANT_BASIC_AUTH_PASSWORD` environment variables. These will be used to construct the authentication necessary to load the distribution.
+
 ## Programmatically build JSON-LD files
 
 You can programmatically build Croissant JSON-LD files using the Python API.

diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -107,6 +107,8 @@
 EXTRACT_PATH = CROISSANT_CACHE / "extract"
 CROISSANT_GIT_USERNAME = "CROISSANT_GIT_USERNAME"
 CROISSANT_GIT_PASSWORD = "CROISSANT_GIT_PASSWORD"
+CROISSANT_BASIC_AUTH_USERNAME = "CROISSANT_BASIC_AUTH_USERNAME"
+CROISSANT_BASIC_AUTH_PASSWORD = "CROISSANT_BASIC_AUTH_PASSWORD"
 
 # Encoding formats
 GIT_HTTPS_ENCODING_FORMAT = "git+https"
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/download.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/download.py
@@ -120,6 +120,21 @@ def extract_git_info(full_url: str) -> tuple[str, str | None]:
         )
 
 
+def get_basic_auth_from_env() -> tuple[str, str] | None:
+    """Determines a Basic Auth tuple from the environment variables.
+
+    This method determines the username and password for the auth tuple from the
+    `CROISSANT_BASIC_AUTH_USERNAME` and `CROISSANT_BASIC_AUTH_PASSWORD` env variables.
+
+    Returns:
+        The Basic Auth tuple if the env variables are configured properly. Otherwise, it
+        returns None.
+    """
+    username = os.environ.get(constants.CROISSANT_BASIC_AUTH_USERNAME)
+    password = os.environ.get(constants.CROISSANT_BASIC_AUTH_PASSWORD)
+    return None if username is None or password is None else (username, password)
+
+
 @dataclasses.dataclass(frozen=True, repr=False)
 class Download(Operation):
     """Downloads from a URL to the disk."""
@@ -128,7 +143,12 @@ class Download(Operation):
 
     def _download_from_http(self, filepath: epath.Path):
         content_url = self.node.content_url
-        response = requests.get(content_url, stream=True, timeout=10)
+        response = requests.get(
+            content_url,
+            stream=True,
+            timeout=10,
+            auth=get_basic_auth_from_env())
+        response.raise_for_status()
         total = int(response.headers.get("Content-Length", 0))
         with filepath.open("wb") as file, tqdm.tqdm(
             desc=f"Downloading {content_url}...",