-
Couldn't load subscription status.
- Fork 36
WIP glob feature #936
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
WIP glob feature #936
Changes from 7 commits
dce2c0a
8277396
397780a
edc12c3
787c8ed
9b757d5
33f7292
a5df394
452568b
26c2572
cd9d522
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| # Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. | ||
| # It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts | ||
| # Command to run python glob_feature.py -p path/to/log/file.darshan | ||
|
|
||
|
|
||
| import argparse | ||
| import pandas as pd | ||
| import difflib | ||
| import darshan | ||
| import re | ||
| import os | ||
|
|
||
|
|
||
def generalize_filename_glob(df):
    """Group similar file paths and collapse each group into a glob-like pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``filename_glob`` column of file-path strings.

    Returns
    -------
    list of tuple
        ``(pattern, count)`` pairs, one per group of similar paths. For a
        group with more than one member, the last digit of the group's
        representative path is replaced with a literal ``\\d`` marker and
        ``count`` is the group size; singleton groups keep their path
        verbatim with a count of 1.
    """
    paths = df["filename_glob"].tolist()
    grouped_paths = []

    # Greedy single-pass grouping: each path joins the first existing group
    # whose representative (the group's first member) is >= 80% similar per
    # difflib.SequenceMatcher.ratio(); otherwise it starts a new group.
    for path in paths:
        for idx, group in enumerate(grouped_paths):
            ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if ratio >= 0.8:
                grouped_paths[idx] = group + (path,)
                break
        else:
            grouped_paths.append((path,))

    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # BUG FIX: escape the prefix before embedding it in a regex;
            # real paths may contain metacharacters ("+", "(", "[", ...)
            # which would otherwise corrupt the pattern or fail to match.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            # Replace the last digit of the representative path with a
            # literal "\d" to act as a glob-style numeric placeholder.
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    # Drop any empty patterns produced by degenerate inputs.
    # (The original also truncated to len(df) entries, but the number of
    # groups can never exceed the number of input paths, so that branch
    # was dead code and has been removed.)
    new_paths = [entry for entry in new_paths if entry[0]]
    return new_paths
|
|
||
|
|
||
def main(log_path, output_path):
    """Generate an HTML table of glob-grouped file paths from a darshan log.

    Parameters
    ----------
    log_path : str
        Path to the input ``.darshan`` log file.
    output_path : str
        Path of the HTML report to write.
    """
    report = darshan.DarshanReport(log_path)

    # One row per name record; keep only records that look like file paths.
    df = pd.DataFrame.from_dict(report.name_records, orient="index",
                                columns=["filename_glob"])
    df = df[df["filename_glob"].str.contains(r"/.*")]
    df.reset_index(drop=True, inplace=True)

    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    df = df.sort_values(by="glob_count", ascending=False)

    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]}
    ])

    # BUG FIX: Styler.hide_index() and Styler.render() were deprecated in
    # pandas 1.4 and removed in pandas 2.0 (see pandas-dev/pandas#43771).
    # Prefer the modern API and fall back for older pandas versions.
    try:
        style = style.hide(axis="index")
    except AttributeError:
        style = style.hide_index()
    try:
        html = style.to_html()
    except AttributeError:
        html = style.render()

    with open(output_path, "w") as html_file:
        html_file.write(html)
|
||
|
|
||
|
|
||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Group similar file paths from a .darshan log into "
                    "glob-like patterns and write an HTML report."
    )
    parser.add_argument('-p', '--log-path', type=str,
                        help="Path to the log file")
    # Consistency fix: hyphenated long options, matching --log-path; the
    # original underscore spelling is kept as a backward-compatible alias.
    parser.add_argument('-o', '--output-path', '--output_path', type=str,
                        help="Path to the output file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
import os
import re
import sys

import darshan
from darshan.log_utils import get_log_path
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest

import glob_feature
# BUG FIX: the original printed sys.path without importing sys (NameError
# at import time); `import sys` is now present. Stray debug prints
# (pd.__version__, sys.path, "hello") at module import time were removed.
@pytest.mark.parametrize("log_name, expected_df", [
    # grow this with more logs...
    ("e3sm_io_heatmap_only.darshan",
     pd.DataFrame({"filename_glob":
         # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d"
         ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc",
          "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
         "glob_count": [2, 1]})),
])
def test_glob_tables(tmpdir, log_name, expected_df):
    """Check the glob-table HTML output of glob_feature.main against the
    expected DataFrame for log files from the darshan logs repo (and new
    log files that you creatively design yourself)."""
    log_path = get_log_path(log_name)
    with tmpdir.as_cwd():
        cwd = os.getcwd()
        # TODO: you shouldn't have a hardcoded HTML filename like this...
        outfile = os.path.join(cwd, "name_record_glob_hd5f.html")
        glob_feature.main(log_path, outfile)
        actual_table = pd.read_html(outfile)[0]
        # The rendered HTML keeps the index as an unnamed first column;
        # drop it before comparing against the expected frame.
        actual_table.drop("Unnamed: 0", axis=1, inplace=True)
        # Debug prints and the print-only Series.compare() diff from the
        # WIP version were removed; assert_frame_equal reports any
        # mismatch with a full diff on failure anyway.
        assert_frame_equal(actual_table, expected_df)
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| # Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read b$ | ||
| # It uses sequence matching and grouping techniques to group similar file paths together and genera$ | ||
| # Command to run python glob_feature.py -p path/to/log/file.darshan | ||
|
|
||
|
|
||
| import argparse | ||
| import pandas as pd | ||
| import difflib | ||
| import darshan | ||
| import re | ||
| import os | ||
|
|
||
|
|
||
def generalize_filename_glob(df):
    """Group similar file paths and collapse each group into a glob-like pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``filename_glob`` column of file-path strings.

    Returns
    -------
    list of tuple
        ``(pattern, count)`` pairs, one per group of similar paths. For a
        group with more than one member, the last digit of the group's
        representative path is replaced with a literal ``\\d`` marker and
        ``count`` is the group size; singleton groups keep their path
        verbatim with a count of 1.
    """
    paths = df["filename_glob"].tolist()
    grouped_paths = []

    # Greedy single-pass grouping: each path joins the first existing group
    # whose representative (the group's first member) is >= 80% similar per
    # difflib.SequenceMatcher.ratio(); otherwise it starts a new group.
    for path in paths:
        for idx, group in enumerate(grouped_paths):
            ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if ratio >= 0.8:
                grouped_paths[idx] = group + (path,)
                break
        else:
            grouped_paths.append((path,))

    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # BUG FIX: escape the prefix before embedding it in a regex;
            # real paths may contain metacharacters ("+", "(", "[", ...)
            # which would otherwise corrupt the pattern or fail to match.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            # Replace the last digit of the representative path with a
            # literal "\d" to act as a glob-style numeric placeholder.
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    # Drop any empty patterns produced by degenerate inputs.
    # (The original also truncated to len(df) entries, but the number of
    # groups can never exceed the number of input paths, so that branch
    # was dead code and has been removed.)
    new_paths = [entry for entry in new_paths if entry[0]]
    return new_paths
|
|
||
|
|
||
|
|
||
|
|
||
def main(log_path, output_path):
    """Generate an HTML table of glob-grouped file paths from a darshan log.

    Parameters
    ----------
    log_path : str
        Path to the input ``.darshan`` log file.
    output_path : str
        Path of the HTML report to write.
    """
    report = darshan.DarshanReport(log_path)

    # One row per name record; keep only records that look like file paths.
    df = pd.DataFrame.from_dict(report.name_records, orient="index",
                                columns=["filename_glob"])
    df = df[df["filename_glob"].str.contains(r"/.*")]
    df.reset_index(drop=True, inplace=True)

    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    df = df.sort_values(by="glob_count", ascending=False)

    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_properties(subset=["glob_count"], **{"text-align": "right"})
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]}
    ])

    # Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
    # to_html() works for both the CLI path and pytest, so the dead
    # commented-out render() alternative was removed.
    html = style.to_html()

    with open(output_path, "w") as html_file:
        html_file.write(html)
|
|
||
|
|
||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Group similar file paths from a .darshan log into "
                    "glob-like patterns and write an HTML report."
    )
    parser.add_argument('-p', '--log-path', type=str,
                        help="Path to the log file")
    # Consistency fix: hyphenated long options, matching --log-path; the
    # original underscore spelling is kept as a backward-compatible alias.
    parser.add_argument('-o', '--output-path', '--output_path', type=str,
                        help="Path to the output file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As you pointed out on Slack, this can fail sometimes, probably depending on the `pandas` version in use. It looks like it was deprecated and removed — for more information on how you might solve this, see pandas-dev/pandas#43771. You may need to check the `pandas` version string and make a decision based on that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sample traceback:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, `to_html()` already has an argument to suppress the index, so I'm a bit confused why we'd even want to mess around with the styler at all here.
Studying the code base is often helpful, i.e., `git grep -E -i "to_html"` in the root of the repo will show some helpful examples.