Skip to content

Commit

Permalink
feat: improve uri scheme parsing with list of available schemes from …
Browse files Browse the repository at this point in the history
…`fsspec` (#1009)

* scheme comparison as lower case

* recovered old test

* parse only valid schemes, direct user to valid schemes in case of error
  • Loading branch information
lobis authored Oct 25, 2023
1 parent 3c429c6 commit b4209bf
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 19 deletions.
36 changes: 26 additions & 10 deletions src/uproot/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,19 @@ def regularize_path(path):
_windows_drive_letter_ending = re.compile(r".*\b[A-Za-z]$")
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")
_might_be_port = re.compile(r"^[0-9].*")
_remote_schemes = ["ROOT", "S3", "HTTP", "HTTPS"]
_schemes = ["FILE", *_remote_schemes]
_uri_scheme = re.compile("^[a-zA-Z][a-zA-Z0-9+.-]*://")

_remote_schemes = ["root", "s3", "http", "https"]
_schemes = ["file", *_remote_schemes]

try:
# TODO: remove this try/except when fsspec becomes a required dependency
import fsspec

_schemes = list({*_schemes, *fsspec.available_protocols()})
except ImportError:
pass

_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://")


def file_object_path_split(path: str) -> tuple[str, str | None]:
Expand All @@ -302,16 +311,23 @@ def file_object_path_split(path: str) -> tuple[str, str | None]:
path: str = regularize_path(path)
path = path.strip()

if _uri_scheme.match(path):
parsed_url = urlparse(path)
parts = parsed_url.path.split(":")
else:
# local file path
if "://" not in path:
# assume it's a local file path
parts = path.split(":")
if pathlib.PureWindowsPath(path).drive:
# Windows absolute path
assert len(parts) >= 2, f"could not split object from windows path {path}"
parts = [parts[0] + ":" + parts[1]] + parts[2:]
elif _uri_scheme.match(path):
# if not a local path, attempt to match a URI scheme
parsed_url = urlparse(path)
parts = parsed_url.path.split(":")
else:
# invalid scheme
scheme = path.split("://")[0]
raise ValueError(
f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}."
)

if len(parts) == 1:
obj = None
Expand Down Expand Up @@ -983,7 +999,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo

parsed_url = urlparse(file_path)

if parsed_url.scheme.upper() in _remote_schemes:
if parsed_url.scheme.lower() in _remote_schemes:
yield file_path, object_path, maybe_steps

else:
Expand Down
3 changes: 1 addition & 2 deletions tests/test_0001-source-class.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,7 @@ def test_http(use_threads):
assert [tobytes(x.raw_data) for x in chunks] == [one, two, three]


@pytest.mark.skip(reason="RECHECK: example.com is flaky, too")
def colons_and_ports():
def test_colons_and_ports():
assert uproot._util.file_object_path_split("https://example.com:443") == (
"https://example.com:443",
None,
Expand Down
18 changes: 11 additions & 7 deletions tests/test_0976_path_object_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,21 @@
None,
),
),
(
"local/file.root://Events",
(
"local/file.root",
"//Events",
),
),
],
)
def test_url_split(input_value, expected_output):
url, obj = uproot._util.file_object_path_split(input_value)
url_expected, obj_expected = expected_output
assert url == url_expected
assert obj == obj_expected


@pytest.mark.parametrize("input_value", ["local/file.root://Events"])
def test_url_split_invalid(input_value):
    """A path containing "://" without a recognized scheme must raise ValueError."""
    with pytest.raises(ValueError):
        uproot._util.file_object_path_split(input_value)

0 comments on commit b4209bf

Please sign in to comment.