| 
3 | 3 | import inspect  | 
4 | 4 | import os  | 
5 | 5 | import subprocess  | 
 | 6 | +import tarfile  | 
6 | 7 | import warnings  | 
 | 8 | +import zipfile  | 
7 | 9 | from collections import defaultdict  | 
8 | 10 | from glob import glob  | 
9 | 11 | from io import StringIO  | 
@@ -689,3 +691,187 @@ def _object_to_dict(obj):  | 
689 | 691 |             data[key] = _object_to_dict(value)  | 
690 | 692 |         return data  | 
691 | 693 |     return obj  | 
 | 694 | + | 
 | 695 | + | 
 | 696 | +#################################################################  | 
 | 697 | + | 
 | 698 | + | 
 | 699 | +def read_archive(  | 
 | 700 | +    file_path: str,  | 
 | 701 | +    extract_to_df: bool = True,  | 
 | 702 | +    file_type: str | None = None,  | 
 | 703 | +    selected_files: list[str] | None = None,  | 
 | 704 | +) -> pd.DataFrame | list[str]:  | 
 | 705 | +    """  | 
 | 706 | +    Reads an archive file (.zip, .tar, .tar.gz) and optionally lists its content  | 
 | 707 | +    or extracts specific files into a DataFrame.  | 
 | 708 | +
  | 
 | 709 | +    Args:  | 
 | 710 | +        file_path: The path to the archive file.  | 
 | 711 | +        extract_to_df: Whether to read the contents into a DataFrame  | 
 | 712 | +            (for CSV or similar formats). Default is True.  | 
 | 713 | +        file_type: Optional file type hint ('zip', 'tar', 'tar.gz').  | 
 | 714 | +            If None, it will be inferred from the file extension.  | 
 | 715 | +        selected_files: List of files to read directly without user interaction.  | 
 | 716 | +
  | 
 | 717 | +    Returns:  | 
 | 718 | +        - A pandas DataFrame if extract_to_df is True  | 
 | 719 | +          and the user selects a file to load.  | 
 | 720 | +        - A list of dataframes that contains  | 
 | 721 | +        compatible file names in the archive otherwise.  | 
 | 722 | +    """  | 
 | 723 | +    file_type = file_type or _infer_file_type(file_path)  | 
 | 724 | + | 
 | 725 | +    if file_type == "zip":  | 
 | 726 | +        return _process_zip_archive(file_path, extract_to_df, selected_files)  | 
 | 727 | +    elif file_type in {"tar", "tar.gz"}:  | 
 | 728 | +        return _process_tar_archive(file_path, extract_to_df, selected_files)  | 
 | 729 | +    else:  | 
 | 730 | +        raise ValueError(  | 
 | 731 | +            "Unsupported archive format. Supported formats are .zip, .tar, or .tar.gz."  | 
 | 732 | +        )  | 
 | 733 | + | 
 | 734 | + | 
 | 735 | +def _process_zip_archive(  | 
 | 736 | +    file_path: str, extract_to_df: bool, selected_files: list[str] | None  | 
 | 737 | +) -> pd.DataFrame | list[str]:  | 
 | 738 | +    """Process a ZIP archive."""  | 
 | 739 | +    with zipfile.ZipFile(file_path) as archive:  | 
 | 740 | +        compatible_files = _list_compatible_files(archive.namelist())  | 
 | 741 | + | 
 | 742 | +        if extract_to_df:  | 
 | 743 | +            return _select_and_extract_from_zip(  | 
 | 744 | +                archive, compatible_files, selected_files  | 
 | 745 | +            )  | 
 | 746 | +        return compatible_files  | 
 | 747 | + | 
 | 748 | + | 
 | 749 | +def _process_tar_archive(  | 
 | 750 | +    file_path: str, extract_to_df: bool, selected_files: list[str] | None  | 
 | 751 | +) -> pd.DataFrame | list[str]:  | 
 | 752 | +    """Process a TAR archive."""  | 
 | 753 | +    mode = "r:gz" if file_path.endswith(".gz") else "r"  | 
 | 754 | +    with tarfile.open(file_path, mode) as archive:  | 
 | 755 | +        compatible_files = _list_compatible_files(archive.getnames())  | 
 | 756 | + | 
 | 757 | +        if extract_to_df:  | 
 | 758 | +            return _select_and_extract_from_tar(  | 
 | 759 | +                archive, compatible_files, selected_files  | 
 | 760 | +            )  | 
 | 761 | +        return compatible_files  | 
 | 762 | + | 
 | 763 | + | 
 | 764 | +def _select_and_extract_from_zip(  | 
 | 765 | +    archive: zipfile.ZipFile,  | 
 | 766 | +    compatible_files: list[str],  | 
 | 767 | +    selected_files: list[str] | None,  | 
 | 768 | +) -> pd.DataFrame | list[pd.DataFrame]:  | 
 | 769 | +    """Select and read specific files from a ZIP archive."""  | 
 | 770 | +    if not selected_files:  | 
 | 771 | +        selected_files = _select_files_interactively(compatible_files)  | 
 | 772 | + | 
 | 773 | +    dfs = []  | 
 | 774 | +    for selected_file in selected_files:  | 
 | 775 | +        with archive.open(selected_file) as file:  | 
 | 776 | +            if selected_file.endswith(".csv"):  | 
 | 777 | +                dfs.append(pd.read_csv(file))  | 
 | 778 | +            elif selected_file.endswith(".xlsx"):  | 
 | 779 | +                dfs.append(pd.read_excel(file))  | 
 | 780 | +    return dfs if len(dfs) > 1 else dfs[0]  | 
 | 781 | + | 
 | 782 | + | 
 | 783 | +def _select_and_extract_from_tar(  | 
 | 784 | +    archive: tarfile.TarFile,  | 
 | 785 | +    compatible_files: list[str],  | 
 | 786 | +    selected_files: list[str] | None,  | 
 | 787 | +) -> pd.DataFrame | list[pd.DataFrame]:  | 
 | 788 | +    """Select and read specific files from a TAR archive."""  | 
 | 789 | +    if not selected_files:  | 
 | 790 | +        selected_files = _select_files_interactively(compatible_files)  | 
 | 791 | + | 
 | 792 | +    dfs = []  | 
 | 793 | +    for selected_file in selected_files:  | 
 | 794 | +        member = archive.getmember(selected_file)  | 
 | 795 | +        with archive.extractfile(member) as file:  | 
 | 796 | +            if selected_file.endswith(".csv"):  | 
 | 797 | +                dfs.append(pd.read_csv(file))  | 
 | 798 | +            elif selected_file.endswith(".xlsx"):  | 
 | 799 | +                dfs.append(pd.read_excel(file))  | 
 | 800 | +    return dfs if len(dfs) > 1 else dfs[0]  | 
 | 801 | + | 
 | 802 | + | 
 | 803 | +def _select_files_interactively(compatible_files: list[str]) -> list[str]:  | 
 | 804 | +    """  | 
 | 805 | +    Allow the user to select files from a list interactively.  | 
 | 806 | +
  | 
 | 807 | +    Args:  | 
 | 808 | +        compatible_files: List of compatible file names.  | 
 | 809 | +
  | 
 | 810 | +    Returns:  | 
 | 811 | +        List of selected file names.  | 
 | 812 | +    """  | 
 | 813 | +    print("Compatible files found in the archive:")  | 
 | 814 | +    for idx, file_name in enumerate(compatible_files, 1):  | 
 | 815 | +        print(f"{idx}. {file_name}")  | 
 | 816 | + | 
 | 817 | +    selected_indices = (  | 
 | 818 | +        input(  | 
 | 819 | +            "Enter the numbers of the files to read, "  | 
 | 820 | +            "separated by commas (e.g., 1,2,3): "  | 
 | 821 | +        )  | 
 | 822 | +        .strip()  | 
 | 823 | +        .split(",")  | 
 | 824 | +    )  | 
 | 825 | +    selected_files = [  | 
 | 826 | +        compatible_files[int(idx) - 1]  | 
 | 827 | +        for idx in selected_indices  | 
 | 828 | +        if idx.strip().isdigit() and 0 < int(idx) <= len(compatible_files)  | 
 | 829 | +    ]  | 
 | 830 | +    if not selected_files:  | 
 | 831 | +        raise ValueError("No valid files selected.")  | 
 | 832 | +    return selected_files  | 
 | 833 | + | 
 | 834 | + | 
 | 835 | +def _list_compatible_files(file_names: list[str]) -> list[str]:  | 
 | 836 | +    """  | 
 | 837 | +    Helper function to list compatible files (e.g., .csv, .xlsx) from an archive.  | 
 | 838 | +
  | 
 | 839 | +    Args:  | 
 | 840 | +        file_names: List of file names in the archive.  | 
 | 841 | +
  | 
 | 842 | +    Returns:  | 
 | 843 | +        List of compatible file names.  | 
 | 844 | +    """  | 
 | 845 | +    compatible_files = [  | 
 | 846 | +        file_name  | 
 | 847 | +        for file_name in file_names  | 
 | 848 | +        if file_name.endswith((".csv", ".xlsx"))  | 
 | 849 | +    ]  | 
 | 850 | +    print("Compatible files detected :", compatible_files)  | 
 | 851 | +    if not compatible_files:  | 
 | 852 | +        raise ValueError("No compatible files found in the archive.")  | 
 | 853 | +    return compatible_files  | 
 | 854 | + | 
 | 855 | + | 
 | 856 | +def _infer_file_type(file_path: str) -> str:  | 
 | 857 | +    """  | 
 | 858 | +    Infer the type of the archive based on the file extension.  | 
 | 859 | +
  | 
 | 860 | +    Args:  | 
 | 861 | +        file_path: Path to the file.  | 
 | 862 | +
  | 
 | 863 | +    Returns:  | 
 | 864 | +        A string representing the archive type ('zip', 'tar', 'tar.gz').  | 
 | 865 | +
  | 
 | 866 | +    Raises:  | 
 | 867 | +        ValueError if the file extension is unsupported.  | 
 | 868 | +    """  | 
 | 869 | +    if file_path.endswith(".zip"):  | 
 | 870 | +        return "zip"  | 
 | 871 | +    elif file_path.endswith((".tar", ".tar.gz")):  | 
 | 872 | +        return "tar.gz" if file_path.endswith(".tar.gz") else "tar"  | 
 | 873 | +    else:  | 
 | 874 | +        raise ValueError(  | 
 | 875 | +            "Cannot infer file type from the file extension. "  | 
 | 876 | +            "Please specify the 'file_type' parameter."  | 
 | 877 | +        )  | 
0 commit comments