Skip to content

Commit

Permalink
Merge pull request #245 from mlcommons/add-http-basic-auth
Browse files Browse the repository at this point in the history
Add support for basic auth
goeffthomas authored Oct 18, 2023
2 parents c703b62 + 08762a6 commit 4204c61
Showing 4 changed files with 704 additions and 1 deletion.
673 changes: 673 additions & 0 deletions datasets/world-happiness/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,673 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"column": "ml:column",
"extract": "ml:extract",
"field": "ml:field",
"fileProperty": "ml:fileProperty",
"format": "ml:format",
"includes": "ml:includes",
"isEnumeration": "ml:isEnumeration",
"jsonPath": "ml:jsonPath",
"ml": "http://mlcommons.org/schema/",
"parentField": "ml:parentField",
"path": "ml:path",
"recordSet": "ml:recordSet",
"references": "ml:references",
"regex": "ml:regex",
"repeated": "ml:repeated",
"replace": "ml:replace",
"sc": "https://schema.org/",
"separator": "ml:separator",
"source": "ml:source",
"subField": "ml:subField",
"transform": "ml:transform",
"data": { "@id": "ml:data", "@type": "@json" },
"dataType": { "@id": "ml:dataType", "@type": "@vocab" }
},
"citation": "None",
"license": "https://creativecommons.org/publicdomain/zero/1.0/",
"url": "https://www.kaggle.com/datasets/unsdsn/world-happiness",
"distribution": [
{
"contentUrl": "https://www.kaggle.com/api/v1/datasets/download/unsdsn/world-happiness",
"contentSize": "36.809 KB",
"md5": "bc1\u002B\u002BXNoIWW685QyDLewsg==",
"encodingFormat": "application/zip",
"@type": "sc:FileObject",
"name": "archive.zip",
"description": "Archive containing all the contents of the World Happiness Report dataset"
},
{
"contentUrl": "2015.csv",
"containedIn": "archive.zip",
"encodingFormat": "text/csv",
"@type": "sc:FileObject",
"name": "2015.csv",
"description": "Happiness rank and scores by country, 2015"
},
{
"contentUrl": "2016.csv",
"containedIn": "archive.zip",
"encodingFormat": "text/csv",
"@type": "sc:FileObject",
"name": "2016.csv",
"description": "Happiness rank and scores by country, 2016"
},
{
"contentUrl": "2017.csv",
"containedIn": "archive.zip",
"encodingFormat": "text/csv",
"@type": "sc:FileObject",
"name": "2017.csv",
"description": "Happiness rank and scores by country, 2017"
},
{
"contentUrl": "2018.csv",
"containedIn": "archive.zip",
"encodingFormat": "text/csv",
"@type": "sc:FileObject",
"name": "2018.csv",
"description": "Happiness rank and scores by country, 2018"
},
{
"contentUrl": "2019.csv",
"containedIn": "archive.zip",
"encodingFormat": "text/csv",
"@type": "sc:FileObject",
"name": "2019.csv",
"description": "Happiness rank and scores by country, 2019"
}
],
"recordSet": [
{
"field": [
{
"dataType": "sc:Text",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Country" }
},
"@type": "ml:Field",
"name": "Country",
"description": "Name of the country."
},
{
"dataType": "sc:Text",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Region" }
},
"@type": "ml:Field",
"name": "Region",
"description": "Region the country belongs to."
},
{
"dataType": "sc:Integer",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Happiness Rank" }
},
"@type": "ml:Field",
"name": "Happiness-Rank",
"description": "Rank of the country based on the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Happiness Score" }
},
"@type": "ml:Field",
"name": "Happiness-Score",
"description": "A metric measured in 2015 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest.\u0022"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Standard Error" }
},
"@type": "ml:Field",
"name": "Standard-Error",
"description": "The standard error of the happiness score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Economy (GDP per Capita)" }
},
"@type": "ml:Field",
"name": "Economy--GDP-per-Capita-",
"description": "The extent to which GDP contributes to the calculation of the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Family" }
},
"@type": "ml:Field",
"name": "Family",
"description": "The extent to which Family contributes to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Health (Life Expectancy)" }
},
"@type": "ml:Field",
"name": "Health--Life-Expectancy-",
"description": "The extent to which Life expectancy contributed to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Freedom" }
},
"@type": "ml:Field",
"name": "Freedom",
"description": "The extent to which Freedom contributed to the calculation of the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Trust (Government Corruption)" }
},
"@type": "ml:Field",
"name": "Trust--Government-Corruption-",
"description": "The extent to which Perception of Corruption contributes to Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Generosity" }
},
"@type": "ml:Field",
"name": "Generosity",
"description": "The extent to which Generosity contributed to the calculation of the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2015.csv",
"extract": { "column": "Dystopia Residual" }
},
"@type": "ml:Field",
"name": "Dystopia-Residual",
"description": "The extent to which Dystopia Residual contributed to the calculation of the Happiness Score."
}
],
"@type": "ml:RecordSet",
"name": "2015.csv_records",
"description": "Happiness rank and scores by country, 2015"
},
{
"field": [
{
"dataType": "sc:Text",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Country" }
},
"@type": "ml:Field",
"name": "Country",
"description": "Name of the country"
},
{
"dataType": "sc:Text",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Region" }
},
"@type": "ml:Field",
"name": "Region",
"description": "Region the country belongs to"
},
{
"dataType": "sc:Integer",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Happiness Rank" }
},
"@type": "ml:Field",
"name": "Happiness-Rank",
"description": "Rank of the country based on the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Happiness Score" }
},
"@type": "ml:Field",
"name": "Happiness-Score",
"description": "A metric measured in 2016 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest\u0022"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Lower Confidence Interval" }
},
"@type": "ml:Field",
"name": "Lower-Confidence-Interval",
"description": "Lower Confidence Interval of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Upper Confidence Interval" }
},
"@type": "ml:Field",
"name": "Upper-Confidence-Interval",
"description": "Upper Confidence Interval of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Economy (GDP per Capita)" }
},
"@type": "ml:Field",
"name": "Economy--GDP-per-Capita-",
"description": "The extent to which GDP contributes to the calculation of the Happiness Score."
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Family" }
},
"@type": "ml:Field",
"name": "Family",
"description": "The extent to which Family contributes to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Health (Life Expectancy)" }
},
"@type": "ml:Field",
"name": "Health--Life-Expectancy-",
"description": "The extent to which Life expectancy contributed to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Freedom" }
},
"@type": "ml:Field",
"name": "Freedom",
"description": "The extent to which Freedom contributed to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Trust (Government Corruption)" }
},
"@type": "ml:Field",
"name": "Trust--Government-Corruption-",
"description": "The extent to which Perception of Corruption contributes to Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Generosity" }
},
"@type": "ml:Field",
"name": "Generosity",
"description": "The extent to which Generosity contributed to the calculation of the Happiness Score"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2016.csv",
"extract": { "column": "Dystopia Residual" }
},
"@type": "ml:Field",
"name": "Dystopia-Residual",
"description": "The extent to which Dystopia Residual contributed to the calculation of the Happiness Score."
}
],
"@type": "ml:RecordSet",
"name": "2016.csv_records",
"description": "Happiness rank and scores by country, 2016"
},
{
"field": [
{
"dataType": "sc:Text",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Country" }
},
"@type": "ml:Field",
"name": "Country",
"description": "Country column"
},
{
"dataType": "sc:Integer",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Happiness.Rank" }
},
"@type": "ml:Field",
"name": "Happiness.Rank",
"description": "Happiness.Rank column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Happiness.Score" }
},
"@type": "ml:Field",
"name": "Happiness.Score",
"description": "A metric measured in 2017 by asking the sampled people the question: \u0022How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest\u0022"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Whisker.high" }
},
"@type": "ml:Field",
"name": "Whisker.high",
"description": "Whisker.high column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Whisker.low" }
},
"@type": "ml:Field",
"name": "Whisker.low",
"description": "Whisker.low column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Economy..GDP.per.Capita." }
},
"@type": "ml:Field",
"name": "Economy..GDP.per.Capita.",
"description": "Economy..GDP.per.Capita. column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Family" }
},
"@type": "ml:Field",
"name": "Family",
"description": "Family column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Health..Life.Expectancy." }
},
"@type": "ml:Field",
"name": "Health..Life.Expectancy.",
"description": "Health..Life.Expectancy. column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Freedom" }
},
"@type": "ml:Field",
"name": "Freedom",
"description": "Freedom column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Generosity" }
},
"@type": "ml:Field",
"name": "Generosity",
"description": "Generosity column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Trust..Government.Corruption." }
},
"@type": "ml:Field",
"name": "Trust..Government.Corruption.",
"description": "Trust..Government.Corruption. column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2017.csv",
"extract": { "column": "Dystopia.Residual" }
},
"@type": "ml:Field",
"name": "Dystopia.Residual",
"description": "Dystopia.Residual column"
}
],
"@type": "ml:RecordSet",
"name": "2017.csv_records",
"description": "Happiness rank and scores by country, 2017"
},
{
"field": [
{
"dataType": "sc:Integer",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Overall rank" }
},
"@type": "ml:Field",
"name": "Overall-rank",
"description": "Overall rank column"
},
{
"dataType": "sc:Text",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Country or region" }
},
"@type": "ml:Field",
"name": "Country-or-region",
"description": "Country or region column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Score" }
},
"@type": "ml:Field",
"name": "Score",
"description": "Score column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "GDP per capita" }
},
"@type": "ml:Field",
"name": "GDP-per-capita",
"description": "GDP per capita column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Social support" }
},
"@type": "ml:Field",
"name": "Social-support",
"description": "Social support column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Healthy life expectancy" }
},
"@type": "ml:Field",
"name": "Healthy-life-expectancy",
"description": "Healthy life expectancy column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Freedom to make life choices" }
},
"@type": "ml:Field",
"name": "Freedom-to-make-life-choices",
"description": "Freedom to make life choices column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Generosity" }
},
"@type": "ml:Field",
"name": "Generosity",
"description": "Generosity column"
},
{
"dataType": "sc:Text",
"source": {
"distribution": "2018.csv",
"extract": { "column": "Perceptions of corruption" }
},
"@type": "ml:Field",
"name": "Perceptions-of-corruption",
"description": "Perceptions of corruption column"
}
],
"@type": "ml:RecordSet",
"name": "2018.csv_records",
"description": "Happiness rank and scores by country, 2018"
},
{
"field": [
{
"dataType": "sc:Integer",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Overall rank" }
},
"@type": "ml:Field",
"name": "Overall-rank",
"description": "Overall rank column"
},
{
"dataType": "sc:Text",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Country or region" }
},
"@type": "ml:Field",
"name": "Country-or-region",
"description": "Country or region column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Score" }
},
"@type": "ml:Field",
"name": "Score",
"description": "Score column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "GDP per capita" }
},
"@type": "ml:Field",
"name": "GDP-per-capita",
"description": "GDP per capita column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Social support" }
},
"@type": "ml:Field",
"name": "Social-support",
"description": "Social support column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Healthy life expectancy" }
},
"@type": "ml:Field",
"name": "Healthy-life-expectancy",
"description": "Healthy life expectancy column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Freedom to make life choices" }
},
"@type": "ml:Field",
"name": "Freedom-to-make-life-choices",
"description": "Freedom to make life choices column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Generosity" }
},
"@type": "ml:Field",
"name": "Generosity",
"description": "Generosity column"
},
{
"dataType": "sc:Float",
"source": {
"distribution": "2019.csv",
"extract": { "column": "Perceptions of corruption" }
},
"@type": "ml:Field",
"name": "Perceptions-of-corruption",
"description": "Perceptions of corruption column"
}
],
"@type": "ml:RecordSet",
"name": "2019.csv_records",
"description": "Happiness rank and scores by country, 2019"
}
],
"@type": "sc:Dataset",
"name": "World-Happiness-Report",
"description": "Happiness scored according to economic production, social support, etc."
}
8 changes: 8 additions & 0 deletions python/mlcroissant/README.md
Original file line number Diff line number Diff line change
@@ -40,6 +40,14 @@ python scripts/load.py \
--num_records 10
```

### Loading a `distribution` via `git+https`

If the `encodingFormat` of a `distribution` is `git+https`, please provide the username and password by setting the `CROISSANT_GIT_USERNAME` and `CROISSANT_GIT_PASSWORD` environment variables. These will be used to construct the authentication necessary to load the distribution.

### Loading a `distribution` via HTTP with Basic Auth

If the `contentUrl` of a `distribution` requires authentication via Basic Auth, please provide the username and password by setting the `CROISSANT_BASIC_AUTH_USERNAME` and `CROISSANT_BASIC_AUTH_PASSWORD` environment variables. These will be used to construct the authentication necessary to load the distribution. If either variable is unset, the request is sent without Basic Auth credentials.

## Programmatically build JSON-LD files

You can programmatically build Croissant JSON-LD files using the Python API.
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
@@ -107,6 +107,8 @@
EXTRACT_PATH = CROISSANT_CACHE / "extract"
# Names of the environment variables that hold the credentials used to load a
# `git+https` distribution.
CROISSANT_GIT_USERNAME = "CROISSANT_GIT_USERNAME"
CROISSANT_GIT_PASSWORD = "CROISSANT_GIT_PASSWORD"
# Names of the environment variables that hold the HTTP Basic Auth credentials
# used when downloading a distribution over HTTP.
CROISSANT_BASIC_AUTH_USERNAME = "CROISSANT_BASIC_AUTH_USERNAME"
CROISSANT_BASIC_AUTH_PASSWORD = "CROISSANT_BASIC_AUTH_PASSWORD"

# Encoding formats
GIT_HTTPS_ENCODING_FORMAT = "git+https"
Original file line number Diff line number Diff line change
@@ -120,6 +120,21 @@ def extract_git_info(full_url: str) -> tuple[str, str | None]:
)


def get_basic_auth_from_env() -> tuple[str, str] | None:
    """Builds the HTTP Basic Auth credentials from the environment.

    The username is looked up in the `CROISSANT_BASIC_AUTH_USERNAME` env
    variable and the password in `CROISSANT_BASIC_AUTH_PASSWORD`.

    Returns:
        A `(username, password)` tuple when both env variables are set. None as
        soon as either of them is missing.
    """
    user = os.environ.get(constants.CROISSANT_BASIC_AUTH_USERNAME)
    secret = os.environ.get(constants.CROISSANT_BASIC_AUTH_PASSWORD)
    # Both halves are required: a lone username or password is treated the same
    # as no credentials at all.
    if user is not None and secret is not None:
        return (user, secret)
    return None


@dataclasses.dataclass(frozen=True, repr=False)
class Download(Operation):
"""Downloads from a URL to the disk."""
@@ -128,7 +143,12 @@ class Download(Operation):

def _download_from_http(self, filepath: epath.Path):
content_url = self.node.content_url
response = requests.get(content_url, stream=True, timeout=10)
response = requests.get(
content_url,
stream=True,
timeout=10,
auth=get_basic_auth_from_env())
response.raise_for_status()
total = int(response.headers.get("Content-Length", 0))
with filepath.open("wb") as file, tqdm.tqdm(
desc=f"Downloading {content_url}...",

0 comments on commit 4204c61

Please sign in to comment.