Skip to content

Commit

Permalink
Make SEG-Y endianness and spec inference more robust and flexible (#223)
Browse files Browse the repository at this point in the history
* add endianness and more codes which also removes mapping and bidict requirement

* backward compatibility for python <3.12

* Remove unused sample fields from config.py

The 'samples_per_trace' and 'sample_interval' fields were removed from `src/segy/config.py`. These fields are no longer needed and their removal simplifies the configuration handling.

* Refactor SEG-Y file endianness and revision inference

Rework the logic for inferring the endianness and revision of SEG-Y files, introducing explicit handling for different cases and additional settings-based overrides. This update simplifies control flow and improves the clarity and maintainability of the code.

* Remove native endianness enum

* Fix segy_revision_minor calculation in factory.py

* Remove 'sample_format' from SEG-Y file __slots__.

* fix code based on static type analysis

* Refactor spec initialization in SEGY file handling

Extract spec initialization into a separate method `_initialize_spec` to simplify the code structure and improve readability. This refactoring ensures centralized logic for setting the SEG-Y standard and endianness, reducing redundancy and potential errors.

* update locked dependencies

* Update error message for endianness inference failure

* Add SegyStandard.REV2 to test parameters

---------

Co-authored-by: Altay Sansal <[email protected]>
  • Loading branch information
tasansal and Altay Sansal authored Nov 19, 2024
1 parent ff7321d commit b5885c9
Show file tree
Hide file tree
Showing 10 changed files with 1,511 additions and 1,310 deletions.
2,400 changes: 1,238 additions & 1,162 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ pydantic = "^2.9.0"
pydantic-settings = "^2.4.0"
numba = ">=0.59.1, <0.70.0"
pandas = "^2.2.2"
bidict = "^0.23.1"
typer = "^0.12.5"
rapidfuzz = "^3.9.7"
gcsfs = { version = ">=2024.9.0.post1", optional = true }
Expand Down
6 changes: 0 additions & 6 deletions src/segy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ class BinaryHeaderSettings(SegyBaseSettings):
the binary header in the actual file.
"""

samples_per_trace: int | None = Field(
default=None, description="Override samples per trace."
)
sample_interval: int | None = Field(
default=None, description="Override sample interval."
)
ext_text_header: int | None = Field(
default=None, description="Override extended text headers."
)
Expand Down
7 changes: 4 additions & 3 deletions src/segy/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from segy.schema.base import Endianness
from segy.schema.format import ScalarType
from segy.schema.segy import SegyStandard
from segy.standards.mapping import SEGY_FORMAT_MAP
from segy.standards.codes import DataSampleFormatCode
from segy.transforms import TransformFactory
from segy.transforms import TransformPipeline

Expand Down Expand Up @@ -155,14 +155,15 @@ def create_binary_header(self, update: dict[str, Any] | None = None) -> bytes:
bin_header["segy_revision"] = REV1_BASE16
elif self.segy_revision >= SegyStandard.REV2:
minor, major = np.modf(self.segy_revision.value)
minor = int(minor * 10) # fraction to int
bin_header["segy_revision_major"] = major
bin_header["segy_revision_minor"] = minor

bin_header["sample_interval"] = self.sample_interval
bin_header["orig_sample_interval"] = self.sample_interval
bin_header["samples_per_trace"] = self.samples_per_trace
bin_header["orig_samples_per_trace"] = self.samples_per_trace
bin_header["data_sample_format"] = SEGY_FORMAT_MAP[self.sample_format]
bin_header["data_sample_format"] = DataSampleFormatCode[self.sample_format.name]

if update is not None:
for key, value in update.items():
Expand All @@ -183,7 +184,7 @@ def create_trace_header_template(
Array containing the trace header template.
"""
trace_header_spec = self.spec.trace.header
dtype = trace_header_spec.dtype.newbyteorder(Endianness.NATIVE.symbol)
dtype = trace_header_spec.dtype.newbyteorder("=")

header_template = HeaderArray(np.zeros(shape=size, dtype=dtype))

Expand Down
206 changes: 135 additions & 71 deletions src/segy/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from __future__ import annotations

import sys
from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING
from typing import cast
Expand All @@ -13,98 +15,155 @@
from segy.accessors import TraceAccessor
from segy.arrays import HeaderArray
from segy.config import SegySettings
from segy.constants import REV1_BASE16
from segy.exceptions import EndiannessInferenceError
from segy.indexing import DataIndexer
from segy.indexing import HeaderIndexer
from segy.indexing import TraceIndexer
from segy.schema import Endianness
from segy.schema import ScalarType
from segy.schema import SegyStandard
from segy.standards import get_segy_standard
from segy.standards.mapping import SEGY_FORMAT_MAP
from segy.standards.codes import DataSampleFormatCode
from segy.standards.codes import SegyEndianCode
from segy.transforms import TransformFactory
from segy.transforms import TransformPipeline

if TYPE_CHECKING:
from fsspec import AbstractFileSystem
from numpy.typing import DTypeLike
from numpy.typing import NDArray

from segy.indexing import AbstractIndexer
from segy.schema import SegySpec


@dataclass
class SegyScanResult:
class SegyInferResult:
"""A scan result of a SEG-Y file.
Attributes:
endianness: Endianness of the file.
revision: SEG-Y revision as float.
sample_format: SEG-Y sample format.
"""

__slots__ = ("endianness", "revision", "sample_format")
__slots__ = ("endianness", "revision")

endianness: Endianness
revision: float
sample_format: ScalarType


def infer_endianness(
fs: AbstractFileSystem,
url: str,
spec: SegySpec,
) -> SegyScanResult:
"""Infer endianness of a binary header buffer given header spec.
class EndiannessAction(Enum):
"""Descriptive flag enum for endianness reversal."""

REVERSE = True
KEEP = False

The buffer length and binary header spec itemsize must be the same.

def infer_endianness(
buffer: bytes,
settings: SegySettings,
) -> EndiannessAction:
"""Infer if we need to reverse the endianness of the seismic data.
Args:
fs: FSSpec filesystem instance.
url: Path to the SEG-Y file.
spec: SEG-Y spec containing how to parse binary header.
buffer: Bytes representing the binary header.
settings: Settings instance to configure / override.
Returns:
A SegyScanResult instance filled with inferred endianness, revision, and format.
A boolean indicating if the endianness need to be reversed.
Raises:
EndiannessInferenceError: When inference fails.
"""
bin_spec = spec.binary_header.model_copy(deep=True) # we will mutate, so copy
buffer = fs.read_block(url, offset=bin_spec.offset, length=bin_spec.itemsize)

for endianness in [Endianness.BIG, Endianness.LITTLE]:
bin_spec.endianness = endianness
bin_hdr = np.frombuffer(buffer, dtype=bin_spec.dtype)
# Use endianness from settings if provided
if settings.endianness is not None:
return EndiannessAction(settings.endianness != sys.byteorder)

# Define offsets and data types
endian_offset = 96
format_offset = 24
endian_dtype = np.dtype("uint32")
format_dtype = np.dtype("uint16")
supported_formats = set(DataSampleFormatCode._value2member_map_.keys())

# Attempt to read explicit endianness code (SEGY Rev2+)
endian_code = np.frombuffer(buffer, endian_dtype, offset=endian_offset, count=1)[0]

if endian_code == SegyEndianCode.NATIVE:
return EndiannessAction.KEEP
if endian_code == SegyEndianCode.REVERSE:
return EndiannessAction.REVERSE
if endian_code == SegyEndianCode.PAIRWISE_SWAP:
msg = "File bytes are pairwise swapped. Currently not supported."
raise NotImplementedError(msg)
if endian_code != 0:
msg = (
f"Explicit endianness code has ambiguous value: {endian_code}. "
"Expected one of {{16909060, 67305985, 33620995}} at byte 97. "
"Provide endianness using SegyFileSettings or SegySpec."
)
raise EndiannessInferenceError(msg)

revision = bin_hdr["segy_revision"].item() / REV1_BASE16
sample_increment = bin_hdr["sample_interval"].item()
sample_format_int = bin_hdr["data_sample_format"].item()
# Legacy method for SEGY Rev <2.0
def check_format_code(dtype: DTypeLike) -> bool:
format_value = np.frombuffer(buffer, dtype, offset=format_offset, count=1)[0]
return format_value in supported_formats

# Validate the inferred values.
in_spec = revision in {0.0, 1.0, 2.0}
increment_is_positive = sample_increment > 0
format_is_valid = sample_format_int in SEGY_FORMAT_MAP.values()
# Check with native machine endianness
if check_format_code(format_dtype):
return EndiannessAction.KEEP

if in_spec and increment_is_positive and format_is_valid:
sample_format = SEGY_FORMAT_MAP.inverse[sample_format_int]
return SegyScanResult(endianness, revision, sample_format)
# Check with reverse machine endianness
if check_format_code(format_dtype.newbyteorder()):
return EndiannessAction.REVERSE

# Inference failed
msg = (
f"Can't infer file endianness, please specify manually. "
f"Detected {revision=}, {sample_increment=}, and {sample_format_int=}."
"Cannot automatically infer file endianness using explicit or legacy "
"methods. Please provide it using SegyFileSettings or SegySpec."
)
raise EndiannessInferenceError(msg)


def infer_spec(fs: AbstractFileSystem, url: str) -> SegySpec:
"""Try to infer SEG-Y file revision and endianness to build a SegySpec."""
spec = get_segy_standard(1.0)
scan_result = infer_endianness(fs, url, spec)
new_spec = get_segy_standard(scan_result.revision)
new_spec.trace.data.format = scan_result.sample_format
new_spec.endianness = scan_result.endianness
return new_spec
def infer_revision(
buffer: bytes,
endianness_action: EndiannessAction,
settings: SegySettings,
) -> int | float:
"""Infer the revision number from the binary header of a SEG-Y file.
Args:
buffer: The binary header buffer.
endianness_action: The action to take for endianness.
settings: The SegySettings, which may override the revision.
Returns:
The revision number as a float (e.g., 1.0, 1.2, 2.0).
"""
# Handle override from settings
if settings.binary.revision is not None:
return settings.binary.revision

# Rev2+ defines major and minor version as 1-byte fields. Read major first.
major_dtype, minor_dtype = np.dtype("uint8"), np.dtype("uint8")
revision_major = np.frombuffer(buffer, major_dtype, offset=300, count=1)[0]

# If major is 2, read remaining (minor)
if revision_major >= SegyStandard.REV2:
revision_minor = np.frombuffer(buffer, minor_dtype, offset=301, count=1)[0]

# Legacy (Revision <2.0) fallback
# Read revision from 2-byte field with correct endian
else:
revision_dtype = np.dtype("uint16")
if endianness_action == EndiannessAction.REVERSE:
revision_dtype = revision_dtype.newbyteorder()

revision = np.frombuffer(buffer, revision_dtype, offset=300, count=1)[0]
revision_major = revision >> 8
revision_minor = revision & 0xFF

return int(revision_major) + int(revision_minor) / 10


class SegyFile:
Expand Down Expand Up @@ -133,35 +192,9 @@ def __init__(
self.fs, self.url = url_to_fs(url, **self.settings.storage_options)
self._info = self.fs.info(self.url)

# Spec setting overrides.
# The control flow here is terrible; needs a refactor.
if self.settings.binary.revision is not None:
self.spec = get_segy_standard(self.settings.binary.revision)

# Override/Infer endianness
if self.settings.endianness is None:
scan_result = infer_endianness(self.fs, self.url, self.spec)
self.spec.endianness = scan_result.endianness
else:
self.spec.endianness = self.settings.endianness

# Default, infer if no spec provided.
elif spec is None:
self.spec = infer_spec(self.fs, self.url)

# If spec is provided set to it and update endianness if its None.
else:
self.spec = spec

# Override/Infer endianness
if self.spec.endianness is None:
if self.settings.endianness is None:
scan_result = infer_endianness(self.fs, self.url, self.spec)
self.spec.endianness = scan_result.endianness
else:
self.spec.endianness = self.settings.endianness

self.spec = self._initialize_spec(spec)
self._update_spec()

self.accessors = TraceAccessor(self.spec.trace)

@property
Expand Down Expand Up @@ -251,6 +284,33 @@ def binary_header(self) -> HeaderArray:

return HeaderArray(transforms.apply(bin_hdr))

def _initialize_spec(self, spec: SegySpec | None) -> SegySpec:
"""Initialize the spec based on the settings and/or file contents."""
if spec is None:
scan_result = self._infer_spec()
inferred_spec = get_segy_standard(scan_result.revision)
inferred_spec.endianness = scan_result.endianness
spec = inferred_spec
if spec.endianness is None:
spec.endianness = self._infer_spec().endianness
return spec

def _infer_spec(self) -> SegyInferResult:
"""Scan the file and infer the endianness and SEG-Y standard."""
bin_header_buffer = self.fs.read_block(fn=self.url, offset=3200, length=400)
endianness_action = infer_endianness(bin_header_buffer, self.settings)
revision = infer_revision(bin_header_buffer, endianness_action, self.settings)

if endianness_action == EndiannessAction.REVERSE:
byte_order = "big" if sys.byteorder == "little" else "little"
else:
byte_order = sys.byteorder

return SegyInferResult(
endianness=Endianness(byte_order),
revision=revision,
)

def _update_spec(self) -> None:
"""Parse the binary header and apply some rules."""
if self.spec.ext_text_header is not None:
Expand All @@ -261,6 +321,10 @@ def _update_spec(self) -> None:
settings_num_ext_text = self.settings.binary.ext_text_header
self.spec.ext_text_header.count = settings_num_ext_text

sample_format_value = self.binary_header["data_sample_format"]
sample_format_code = DataSampleFormatCode(sample_format_value)
sample_format = ScalarType[sample_format_code.name]
self.spec.trace.data.format = sample_format
self.spec.trace.data.samples = self.binary_header["samples_per_trace"].item()
self.spec.trace.data.interval = self.binary_header["sample_interval"].item()

Expand Down
2 changes: 0 additions & 2 deletions src/segy/schema/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,11 @@ class Endianness(StrEnum):

BIG = "big"
LITTLE = "little"
NATIVE = "native"

def __init__(self, _: str) -> None:
self._symbol_map = {
"big": ">",
"little": "<",
"native": "=",
}

@property
Expand Down
Loading

0 comments on commit b5885c9

Please sign in to comment.