Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") from the file names recorded in a .darshan log file.
# It uses sequence matching to group similar file paths together and generates an HTML report of the grouped paths and their counts.
# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output.html


import argparse
import pandas as pd
import difflib
import darshan
import re
import os


def generalize_filename_glob(df, similarity_threshold=0.8):
    """Group similar file paths and collapse each group into one glob-like entry.

    Paths are clustered greedily in a single pass: each path joins the first
    existing group whose first member has a ``difflib.SequenceMatcher`` ratio
    of at least ``similarity_threshold`` with it, otherwise it starts a new
    group.  For every group with more than one member, the last digit that
    follows the group's common prefix in the first member is replaced by a
    literal backslash followed by ``d``; single-member groups are reported
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"filename_glob"`` column of path strings.
    similarity_threshold : float, optional
        Minimum similarity ratio (0..1) for a path to join a group.

    Returns
    -------
    list of tuple
        ``(path_or_pattern, member_count)`` pairs, ordered by the first
        appearance of each group.  Entries whose path is empty are dropped.
    """
    paths = df["filename_glob"].tolist()

    groups = []
    for path in paths:
        for group in groups:
            # Only the first member of a group is used as its representative.
            ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if ratio >= similarity_threshold:
                group.append(path)
                break
        else:
            # No existing group was similar enough (or there are none yet).
            groups.append([path])

    new_paths = []
    for group in groups:
        representative = group[0]
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # The prefix is literal path text, so it must be escaped before
            # being embedded in a regex; otherwise characters such as "(" or
            # "[" raise re.error or silently change what is matched.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            representative = re.sub(pattern, r"\1\\d\2", representative)
        if representative:
            new_paths.append((representative, len(group)))

    return new_paths


def main(log_path, output_path):
    """Build a styled table of globbed file names from a Darshan log.

    The name records of the log at ``log_path`` are filtered to entries
    containing a "/", grouped with ``generalize_filename_glob`` and sorted by
    descending ``glob_count``.  The resulting styled table is then rendered
    to HTML and written to ``output_path``.
    """
    report = darshan.DarshanReport(log_path)

    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only name records that contain a path separator.
    df = df[df["filename_glob"].str.contains(r"/.*")]

    df.reset_index(drop=True, inplace=True)  # Reset the index

    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    df = df.sort_values(by="glob_count", ascending=False)

    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]},
    ])

    # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
    # favour of Styler.hide(axis="index"); support both so the report works
    # regardless of the installed pandas version.
    if hasattr(style, "hide"):
        style = style.hide(axis="index")
    else:
        style = style.hide_index()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you pointed out on Slack, this can fail sometimes, probably depending on the pandas version in use. It looks like it was deprecated and removed--for more information on how you might solve this see: pandas-dev/pandas#43771

You may need to check the pandas version string and make a decision based on that.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sample traceback:

Traceback (most recent call last):
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 89, in <module>
    main(log_path=args.log_path, output_path=args.output_path)
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 77, in main
    style = style.hide_index()
            ^^^^^^^^^^^^^^^^
AttributeError: 'Styler' object has no attribute 'hide_index'. Did you mean: 'hide_index_'?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, to_html() already has an argument to suppress the index, so I'm a bit confused why we'd even want to mess around with the styler at all here.

Studying the code base is often helpful, i.e., git grep -E -i "to_html" in the root of the repo will show some helpful examples

# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3)
# is its replacement, so prefer it and fall back only on older versions.
html = style.to_html() if hasattr(style, "to_html") else style.render()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is render() being used here? Isn't that a Jupyter-notebook thing? I'm pretty sure to_html() is what we want. You may need to study the docs here a bit: https://pandas.pydata.org/docs/user_guide/style.html and https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, if I fix your hide_index issue above locally, I then get this traceback, which is another reason we'd want to see the tests running/passing before we start to iterate ourselves (to make sure it works on a basic level):

Traceback (most recent call last):
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 89, in <module>
    main(log_path=args.log_path, output_path=args.output_path)
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 78, in main
    html = style.render()
           ^^^^^^^^^^^^
AttributeError: 'Styler' object has no attribute 'render'. Did you mean: '_render'?


# The command-line entry point passes output_path=None when -o is omitted;
# fall back to an HTML file named after the log (in the current directory)
# instead of letting open() fail with a TypeError.
if output_path is None:
    log_stem = os.path.splitext(os.path.basename(log_path))[0]
    output_path = "{}_glob_table.html".format(log_stem)

with open(output_path, "w") as html_file:
    html_file.write(html)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I fix both of the above issues and then try to run your code via the Python command line route, I still get an error:

python glob_feature.py -p ~/github_projects/darshan-logs/darshan_logs/e3sm_io_heatmaps_and_dxt/e3sm_io_heatmap_only.darshan

Traceback (most recent call last):
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 90, in <module>
    main(log_path=args.log_path, output_path=args.output_path)
  File "/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py", line 81, in main
    with open(output_path, "w") as html_file:
         ^^^^^^^^^^^^^^^^^^^^^^
TypeError: expected str, bytes or os.PathLike object, not NoneType

Some of the testing I mentioned a few weeks ago about handling the various output_path modalities seems to be missing? You'll want tests for the command line and module-based incantations to make sure they work as you iterate on your code.



if __name__ == "__main__":
    # Command-line entry point: parse the log/output paths and build the report.
    parser = argparse.ArgumentParser()
    # The log path is mandatory; without required=True a bare invocation would
    # pass None on to main() and fail deep inside the log reader instead of
    # printing a usage message.
    parser.add_argument('-p', '--log-path', type=str, required=True, help="Path to the log file")
    parser.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
49 changes: 49 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_glob_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Module-level debug prints were removed: they ran at import/collection time,
# and print(sys.path) raised NameError because ``sys`` was never imported.
import os
import darshan
from darshan.log_utils import get_log_path
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest
import re
import glob_feature

@pytest.mark.parametrize("log_name, expected_df", [
    # grow this with more logs...
    # NOTE(review): generalize_filename_glob replaces the digit with a literal
    # backslash + "d", so the first expected name below probably should end
    # in "can_I_out_h\\d.nc" -- confirm against the actual log contents.
    ("e3sm_io_heatmap_only.darshan",
     pd.DataFrame({"filename_glob":
                   # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d"
                   ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc",
                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
                   "glob_count": [2, 1]})),
])
def test_glob_tables(tmpdir, log_name, expected_df):
    """Check the HTML glob table written by ``glob_feature.main`` for a log.

    The report is written inside ``tmpdir``, read back with
    ``pandas.read_html`` and compared with ``expected_df``.
    """
    # test the glob table HTML outputs for various
    # log files in the logs repo (and new log files
    # that you creatively design yourself)
    log_path = get_log_path(log_name)
    with tmpdir.as_cwd():
        cwd = os.getcwd()
        # TODO: you shouldn't have a hardcoded HTML filename
        # like this...
        outfile = os.path.join(cwd, "name_record_glob_hd5f.html")
        glob_feature.main(log_path, outfile)
        actual_table = pd.read_html(outfile)[0]
        # The rendered index shows up as an "Unnamed: 0" column only when the
        # HTML includes it; errors="ignore" keeps the test from failing with a
        # KeyError when the index is not emitted.
        actual_table = actual_table.drop(columns="Unnamed: 0", errors="ignore")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you doing this? If there's a problem in the actual table, there's a problem in your actual code, and you should fix the code, not conceal the problem by mutating the test?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My guess would be the index column perhaps, I'm not convinced you got a full understanding of the html conversion process just yet vs. pandas versions, so maybe look into that a bit more.

print("actual table is", actual_table)
print("expected_df is", expected_df)
print("pandas version is", pd.__version__)
print("log path is", log_path)
# Compare the two DataFrames.
# Series.compare() raises ValueError ("Can only compare identically-labeled
# Series objects") when the two columns differ in length, which would mask
# the real assertion further down; only diff when the row counts agree.
actual_names = actual_table['filename_glob'].reset_index(drop=True)
expected_names = expected_df['filename_glob'].reset_index(drop=True)
if len(actual_names) == len(expected_names):
    diff = actual_names.compare(expected_names)
else:
    diff = "row count differs: actual={}, expected={}".format(
        len(actual_names), len(expected_names))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is causing problems--I see this locally:

            # Compare the two DataFrames
>           diff = actual_table['filename_glob'].compare(expected_df['filename_glob'])

/Users/treddy/github_projects/darshan/darshan-util/pydarshan/darshan/tests/test_glob_feature.py:301: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/Users/treddy/python_venvs/py_311_darshan_dev/lib/python3.11/site-packages/pandas/core/series.py:3185: in compare
    return super().compare(
/Users/treddy/python_venvs/py_311_darshan_dev/lib/python3.11/site-packages/pandas/core/generic.py:9212: in compare
    mask = ~((self == other) | (self.isna() & other.isna()))
/Users/treddy/python_venvs/py_311_darshan_dev/lib/python3.11/site-packages/pandas/core/ops/common.py:81: in new_method
    return method(self, other)
/Users/treddy/python_venvs/py_311_darshan_dev/lib/python3.11/site-packages/pandas/core/arraylike.py:40: in __eq__
    return self._cmp_method(other, operator.eq)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = 0              /home/shane/software/ior/build/testFile
1    /home/shane/software/ior/build/testFile:/Datas...
Name: filename_glob, dtype: object
other = 0    /home/shane/software/ior/build/testFile[.*]
Name: filename_glob, dtype: object, op = <built-in function eq>

    def _cmp_method(self, other, op):
        res_name = ops.get_op_result_name(self, other)
    
        if isinstance(other, Series) and not self._indexed_same(other):
>           raise ValueError("Can only compare identically-labeled Series objects")
E           ValueError: Can only compare identically-labeled Series objects

/Users/treddy/python_venvs/py_311_darshan_dev/lib/python3.11/site-packages/pandas/core/series.py:6090: ValueError

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's really no reason to have this line present anymore, the assert_frame_equal should suffice.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I delete that stuff, then I see a bunch of test failures like this:

FAILED tests/test_glob_feature.py::test_glob_tables[hdf5_diagonal_write_bytes_range_dxt.darshan-expected_df6] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[runtime_and_dxt_heatmaps_diagonal_write_only.darshan-expected_df20] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[hdf5_diagonal_write_half_flush_dxt.darshan-expected_df7] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[treddy_runtime_heatmap_inactive_ranks.darshan-expected_df21] - ValueError: empty vocabulary; perhaps the documents only contain stop words
FAILED tests/test_glob_feature.py::test_glob_tables[hdf5_diagonal_write_half_ranks_dxt.darshan-expected_df8] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[hdf5_file_opens_only.darshan-expected_df9] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[darshan-apmpi-2nodes-64mpi.darshan-expected_df2] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[hdf5_diagonal_write_1_byte_dxt.darshan-expected_df5] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[imbalanced-io.darshan-expected_df11] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan-expected_df12] - AssertionError: DataFrame are different
FAILED tests/test_glob_feature.py::test_glob_tables[shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan-expected_df13] - AssertionError: DataFrame are different

# Print the differences (debugging aid only; shown by pytest when the test fails)
print(diff)
# The actual pass/fail check: both tables must match exactly.
assert_frame_equal(actual_table, expected_df)


98 changes: 98 additions & 0 deletions darshan-util/pydarshan/glob_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") from the file names recorded in a .darshan log file.
# It uses sequence matching to group similar file paths together and generates an HTML report of the grouped paths and their counts.
# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output.html


import argparse
import pandas as pd
import difflib
import darshan
import re
import os


def generalize_filename_glob(df, similarity_threshold=0.8):
    """Group similar file paths and collapse each group into one glob-like entry.

    Paths are clustered greedily in a single pass: each path joins the first
    existing group whose first member has a ``difflib.SequenceMatcher`` ratio
    of at least ``similarity_threshold`` with it, otherwise it starts a new
    group.  For every group with more than one member, the last digit that
    follows the group's common prefix in the first member is replaced by a
    literal backslash followed by ``d``; single-member groups are reported
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"filename_glob"`` column of path strings.
    similarity_threshold : float, optional
        Minimum similarity ratio (0..1) for a path to join a group.

    Returns
    -------
    list of tuple
        ``(path_or_pattern, member_count)`` pairs, ordered by the first
        appearance of each group.  Entries whose path is empty are dropped.
    """
    paths = df["filename_glob"].tolist()

    groups = []
    for path in paths:
        for group in groups:
            # Only the first member of a group is used as its representative.
            ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if ratio >= similarity_threshold:
                group.append(path)
                break
        else:
            # No existing group was similar enough (or there are none yet).
            groups.append([path])

    new_paths = []
    for group in groups:
        representative = group[0]
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # The prefix is literal path text, so it must be escaped before
            # being embedded in a regex; otherwise characters such as "(" or
            # "[" raise re.error or silently change what is matched.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            representative = re.sub(pattern, r"\1\\d\2", representative)
        if representative:
            new_paths.append((representative, len(group)))

    return new_paths




def main(log_path, output_path=None):
    """Write an HTML table of globbed file names found in a Darshan log.

    The name records of the log at ``log_path`` are filtered to entries
    containing a "/", grouped with ``generalize_filename_glob``, sorted by
    descending ``glob_count``, styled and written out as HTML.

    Parameters
    ----------
    log_path : str
        Path to the Darshan log file.
    output_path : str, optional
        Destination HTML file.  When ``None`` (e.g. the command line was run
        without ``-o``), the file is named after the log and written to the
        current working directory.
    """
    report = darshan.DarshanReport(log_path)

    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only name records that contain a path separator.
    df = df[df["filename_glob"].str.contains(r"/.*")]

    df.reset_index(drop=True, inplace=True)  # Reset the index

    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    df = df.sort_values(by="glob_count", ascending=False)

    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_properties(subset=["glob_count"], **{"text-align": "right"})

    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]},
    ])

    # Styler.to_html() exists from pandas 1.3; Styler.render() is the older
    # spelling and was removed in pandas 2.0.  Picking whichever is available
    # makes the script and the tests work with the same code path.
    html = style.to_html() if hasattr(style, "to_html") else style.render()

    if output_path is None:
        # Avoid open(None) raising TypeError when no output file was given.
        log_stem = os.path.splitext(os.path.basename(log_path))[0]
        output_path = "{}_glob_table.html".format(log_stem)

    with open(output_path, "w") as html_file:
        html_file.write(html)


if __name__ == "__main__":
    # Command-line entry point: parse the log/output paths and build the report.
    parser = argparse.ArgumentParser()
    # The log path is mandatory; without required=True a bare invocation would
    # pass None on to main() and fail deep inside the log reader instead of
    # printing a usage message.
    parser.add_argument('-p', '--log-path', type=str, required=True, help="Path to the log file")
    parser.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)