diff --git a/shared/bundle_analysis/models.py b/shared/bundle_analysis/models.py
index e3abde81..82a4a4b5 100644
--- a/shared/bundle_analysis/models.py
+++ b/shared/bundle_analysis/models.py
@@ -221,6 +221,7 @@ class Asset(Base):
     name = Column(types.Text, nullable=False)
     normalized_name = Column(types.Text, nullable=False)
     size = Column(types.Integer, nullable=False)
+    gzip_size = Column(types.Integer, nullable=False)
     uuid = Column(types.Text, nullable=False)
     asset_type = Column(SQLAlchemyEnum(AssetType))
     session = relationship("Session", backref=backref("assets"))
diff --git a/shared/bundle_analysis/parser.py b/shared/bundle_analysis/parser.py
index 87108713..b00b58b0 100644
--- a/shared/bundle_analysis/parser.py
+++ b/shared/bundle_analysis/parser.py
@@ -1,336 +1,43 @@
-import json
 import logging
-import re
-import uuid
-from typing import Tuple
 
 import ijson
 from sqlalchemy.orm import Session as DbSession
 
-from shared.bundle_analysis.models import (
-    Asset,
-    AssetType,
-    Bundle,
-    Chunk,
-    Module,
-    Session,
-    assets_chunks,
-    chunks_modules,
-)
-from shared.bundle_analysis.utils import get_extension
+from shared.bundle_analysis.parsers import ParserInterface, ParserV1, ParserV2
 
 log = logging.getLogger(__name__)
 
+PARSER_VERSION_MAPPING = {
+    "1": ParserV1,
+    "2": ParserV2,
+}
+
+
 class Parser:
     """
-    This does a streaming JSON parse of the stats JSON file referenced by `path`.
-    It's more complicated that just doing a `json.loads` but should keep our memory
-    usage constrained.
+    Retrieve the bundle stats file version and return an instance of the associated parser.
     """
 
-    def __init__(self, db_session: DbSession):
+    def __init__(self, path: str, db_session: DbSession):
+        self.path = path
         self.db_session = db_session
 
-    def reset(self):
-        """
-        Resets temporary parser state in order to parse a new file path.
-        """
-        # chunk unique id -> asset name list
-        self.chunk_asset_names_index = {}
-
-        # module name -> chunk external id list
-        self.module_chunk_unique_external_ids_index = {}
-
-        # misc. top-level info from the stats data (i.e. bundler version, bundle time, etc.)
- self.info = {} - - # temporary parser state - self.session = None - self.asset = None - self.chunk = None - self.chunk_asset_names = [] - self.module = None - self.module_chunk_unique_external_ids = [] - - self.asset_list = [] - self.chunk_list = [] - self.module_list = [] - - def parse(self, path: str) -> int: + def get_proper_parser(self) -> object: + error = None try: - self.reset() - - # Retrieve the info section first before parsing all the other things - # this way when an error is raised we know which bundle plugin caused it - with open(path, "rb") as f: - for event in ijson.parse(f): - self._parse_info(event) - - self.session = Session(info={}) - self.db_session.add(self.session) - self.db_session.flush() - - with open(path, "rb") as f: + with open(self.path, "rb") as f: for event in ijson.parse(f): - self._parse_event(event) - - if self.asset_list: - insert_asset = Asset.__table__.insert().values(self.asset_list) - self.db_session.execute(insert_asset) - - if self.chunk_list: - insert_chunks = Chunk.__table__.insert().values(self.chunk_list) - self.db_session.execute(insert_chunks) - - if self.module_list: - insert_modules = Module.__table__.insert().values(self.module_list) - self.db_session.execute(insert_modules) - - self.db_session.flush() - - # Delete old session/asset/chunk/module with the same bundle name if applicable - old_session = ( - self.db_session.query(Session) - .filter( - Session.bundle == self.session.bundle, - Session.id != self.session.id, - ) - .one_or_none() - ) - if old_session: - for model in [Asset, Chunk, Module]: - to_be_deleted = self.db_session.query(model).filter( - model.session == old_session - ) - for item in to_be_deleted: - self.db_session.delete(item) - self.db_session.flush() - self.db_session.delete(old_session) - self.db_session.flush() - - # save top level bundle stats info - self.session.info = json.dumps(self.info) - - # this happens last so that we could potentially handle any ordering - # of top-level keys inside the JSON (i.e. we couldn't associate a chunk - # to an asset above if we parse the chunk before the asset) - self._create_associations() - - assert self.session.bundle is not None - return self.session.id - except Exception as e: - # Inject the plugin name to the Exception object so we have visibilitity on which plugin - # is causing the trouble. 
- e.bundle_analysis_plugin_name = self.info.get("plugin_name", "unknown") - raise e - - def _asset_type(self, name: str) -> AssetType: - extension = get_extension(name) - - if extension in ["js"]: - return AssetType.JAVASCRIPT - if extension in ["css"]: - return AssetType.STYLESHEET - if extension in ["woff", "woff2", "ttf", "otf", "eot"]: - return AssetType.FONT - if extension in ["jpg", "jpeg", "png", "gif", "svg", "webp", "apng", "avif"]: - return AssetType.IMAGE - - return AssetType.UNKNOWN - - def _parse_info(self, event: Tuple[str, str, str]): - prefix, _, value = event - - # session info - if prefix == "version": - self.info["version"] = value - elif prefix == "bundler.name": - self.info["bundler_name"] = value - elif prefix == "bundler.version": - self.info["bundler_version"] = value - elif prefix == "builtAt": - self.info["built_at"] = value - elif prefix == "plugin.name": - self.info["plugin_name"] = value - elif prefix == "plugin.version": - self.info["plugin_version"] = value - elif prefix == "duration": - self.info["duration"] = value - - def _parse_event(self, event: Tuple[str, str, str]): - prefix, _, value = event - prefix_path = prefix.split(".") - - # asset / chunks / modules - if prefix_path[0] == "assets": - self._parse_assets_event(*event) - elif prefix_path[0] == "chunks": - self._parse_chunks_event(*event) - elif prefix_path[0] == "modules": - self._parse_modules_event(*event) - - # bundle name - elif prefix == "bundleName": - if not re.fullmatch(r"^[\w\d_:/@\.{}\[\]$-]+$", value): - log.info(f'bundle name does not match regex: "{value}"') - raise Exception("invalid bundle name") - bundle = self.db_session.query(Bundle).filter_by(name=value).first() - if bundle is None: - bundle = Bundle(name=value) - self.db_session.add(bundle) - self.session.bundle = bundle - - def _parse_assets_event(self, prefix: str, event: str, value: str): - if (prefix, event) == ("assets.item", "start_map"): - # new asset - assert self.asset is None - self.asset = Asset(session_id=self.session.id) - elif prefix == "assets.item.name": - self.asset.name = value - elif prefix == "assets.item.normalized": - self.asset.normalized_name = value - elif prefix == "assets.item.size": - self.asset.size = int(value) - elif (prefix, event) == ("assets.item", "end_map"): - self.asset_list.append( - dict( - session_id=self.asset.session_id, - name=self.asset.name, - normalized_name=self.asset.normalized_name, - size=self.asset.size, - uuid=str(uuid.uuid4()), - asset_type=self._asset_type(self.asset.name), - ) - ) - - # reset parser state - self.asset = None - - def _parse_chunks_event(self, prefix: str, event: str, value: str): - if (prefix, event) == ("chunks.item", "start_map"): - # new chunk - assert self.chunk is None - self.chunk = Chunk(session_id=self.session.id) - elif prefix == "chunks.item.id": - self.chunk.external_id = value - elif prefix == "chunks.item.uniqueId": - self.chunk.unique_external_id = value - elif prefix == "chunks.item.initial": - self.chunk.initial = value - elif prefix == "chunks.item.entry": - self.chunk.entry = value - elif prefix == "chunks.item.files.item": - self.chunk_asset_names.append(value) - elif (prefix, event) == ("chunks.item", "end_map"): - self.chunk_list.append( - dict( - session_id=self.chunk.session_id, - external_id=self.chunk.external_id, - unique_external_id=self.chunk.unique_external_id, - initial=self.chunk.initial, - entry=self.chunk.entry, - ) - ) - - self.chunk_asset_names_index[self.chunk.unique_external_id] = ( - self.chunk_asset_names - ) - # reset 
parser state
-            self.chunk = None
-            self.chunk_asset_names = []
-
-    def _parse_modules_event(self, prefix: str, event: str, value: str):
-        if (prefix, event) == ("modules.item", "start_map"):
-            # new module
-            assert self.module is None
-            self.module = Module(session_id=self.session.id)
-        elif prefix == "modules.item.name":
-            self.module.name = value
-        elif prefix == "modules.item.size":
-            self.module.size = int(value)
-        elif prefix == "modules.item.chunkUniqueIds.item":
-            self.module_chunk_unique_external_ids.append(value)
-        elif (prefix, event) == ("modules.item", "end_map"):
-            self.module_list.append(
-                dict(
-                    session_id=self.module.session_id,
-                    name=self.module.name,
-                    size=self.module.size,
-                )
-            )
-
-            self.module_chunk_unique_external_ids_index[self.module.name] = (
-                self.module_chunk_unique_external_ids
-            )
-            # reset parser state
-            self.module = None
-            self.module_chunk_unique_external_ids = []
-
-    def _create_associations(self):
-        # associate chunks to assets
-        inserts = []
-        assets: list[Asset] = (
-            self.db_session.query(Asset)
-            .filter(
-                Asset.session_id == self.session.id,
-            )
-            .all()
-        )
-
-        asset_name_to_id = {asset.name: asset.id for asset in assets}
-
-        chunks: list[Chunk] = (
-            self.db_session.query(Chunk)
-            .filter(
-                Chunk.session_id == self.session.id,
-            )
-            .all()
-        )
-
-        chunk_unique_id_to_id = {chunk.unique_external_id: chunk.id for chunk in chunks}
-
-        modules = (
-            self.db_session.query(Module)
-            .filter(
-                Module.session_id == self.session.id,
-            )
-            .all()
-        )
-
-        for chunk in chunks:
-            chunk_id = chunk.id
-            asset_names = self.chunk_asset_names_index[chunk.unique_external_id]
-            inserts.extend(
-                [
-                    dict(asset_id=asset_name_to_id[asset_name], chunk_id=chunk_id)
-                    for asset_name in asset_names
-                ]
-            )
-        if inserts:
-            self.db_session.execute(assets_chunks.insert(), inserts)
-
-        # associate modules to chunks
-        # FIXME: this isn't quite right - need to sort out how non-JS assets reference chunks
-        inserts = []
-
-        modules: list[Module] = self.db_session.query(Module).filter(
-            Module.session_id == self.session.id,
-        )
-        for module in modules:
-            module_id = module.id
-            chunk_unique_external_ids = self.module_chunk_unique_external_ids_index[
-                module.name
-            ]
-
-            inserts.extend(
-                [
-                    dict(
-                        chunk_id=chunk_unique_id_to_id[unique_external_id],
-                        module_id=module_id,
-                    )
-                    for unique_external_id in chunk_unique_external_ids
-                ]
-            )
-        if inserts:
-            self.db_session.execute(chunks_modules.insert(), inserts)
+                    prefix, _, value = event
+                    if prefix == "version":
+                        selected_parser = PARSER_VERSION_MAPPING.get(value)
+                        if selected_parser is None:
+                            error = f"parser not implemented for version {value}"
+                        elif not issubclass(selected_parser, ParserInterface):
+                            error = "invalid parser implementation"
+                        else:
+                            return selected_parser(self.db_session)
+                        break
+                if error is None:
+                    error = "version does not exist in bundle file"
+        except IOError:
+            error = "unable to open file"
+        if error:
+            raise Exception(f"Couldn't parse bundle: {error}")
diff --git a/shared/bundle_analysis/parsers/__init__.py b/shared/bundle_analysis/parsers/__init__.py
new file mode 100644
index 00000000..3842f214
--- /dev/null
+++ b/shared/bundle_analysis/parsers/__init__.py
@@ -0,0 +1,9 @@
+from shared.bundle_analysis.parsers.base import ParserInterface
+from shared.bundle_analysis.parsers.v1 import ParserV1
+from shared.bundle_analysis.parsers.v2 import ParserV2
+
+__all__ = [
+    "ParserInterface",
+    "ParserV1",
+    "ParserV2",
+]
diff --git a/shared/bundle_analysis/parsers/base.py b/shared/bundle_analysis/parsers/base.py
new file mode 100644
index 00000000..754de833 --- /dev/null +++ b/shared/bundle_analysis/parsers/base.py @@ -0,0 +1,7 @@ +import abc + + +class ParserInterface(metaclass=abc.ABCMeta): + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, "parse") and callable(subclass.parse) diff --git a/shared/bundle_analysis/parsers/v1.py b/shared/bundle_analysis/parsers/v1.py new file mode 100644 index 00000000..2bc0b1e6 --- /dev/null +++ b/shared/bundle_analysis/parsers/v1.py @@ -0,0 +1,371 @@ +import json +import logging +import re +import uuid +from typing import Tuple + +import ijson +from sqlalchemy.orm import Session as DbSession + +from shared.bundle_analysis.models import ( + Asset, + AssetType, + Bundle, + Chunk, + Module, + Session, + assets_chunks, + chunks_modules, +) +from shared.bundle_analysis.utils import get_extension + +log = logging.getLogger(__name__) + + +""" +Version 1 Schema +{ + "version": "1", + "plugin": { + "name": str + "version": str + }, + "builtAt": int, + "duration": int, + "bundler": { "name": str, "version": str }, + "bundleName": str, + "assets": [{ + "name": str", + "size": int, + "normalized": str + }], + "chunks": [{ + "id": str, + "uniqueId": str, + "entry": bool, + "initial": bool, + "files": [str], + "names": [str] + }], + "modules": [{ + "name": str, + "size": int, + "chunkUniqueIds": [str] + }] +} +""" + + +class ParserV1: + """ + This does a streaming JSON parse of the stats JSON file referenced by `path`. + It's more complicated that just doing a `json.loads` but should keep our memory + usage constrained. + """ + + def __init__(self, db_session: DbSession): + self.db_session = db_session + + def reset(self): + """ + Resets temporary parser state in order to parse a new file path. + """ + # chunk unique id -> asset name list + self.chunk_asset_names_index = {} + + # module name -> chunk external id list + self.module_chunk_unique_external_ids_index = {} + + # misc. top-level info from the stats data (i.e. bundler version, bundle time, etc.) 
+ self.info = {} + + # temporary parser state + self.session = None + self.asset = None + self.chunk = None + self.chunk_asset_names = [] + self.module = None + self.module_chunk_unique_external_ids = [] + + self.asset_list = [] + self.chunk_list = [] + self.module_list = [] + + def parse(self, path: str) -> int: + try: + self.reset() + + # Retrieve the info section first before parsing all the other things + # this way when an error is raised we know which bundle plugin caused it + with open(path, "rb") as f: + for event in ijson.parse(f): + self._parse_info(event) + + self.session = Session(info={}) + self.db_session.add(self.session) + self.db_session.flush() + + with open(path, "rb") as f: + for event in ijson.parse(f): + self._parse_event(event) + + if self.asset_list: + insert_asset = Asset.__table__.insert().values(self.asset_list) + self.db_session.execute(insert_asset) + + if self.chunk_list: + insert_chunks = Chunk.__table__.insert().values(self.chunk_list) + self.db_session.execute(insert_chunks) + + if self.module_list: + insert_modules = Module.__table__.insert().values(self.module_list) + self.db_session.execute(insert_modules) + + self.db_session.flush() + + # Delete old session/asset/chunk/module with the same bundle name if applicable + old_session = ( + self.db_session.query(Session) + .filter( + Session.bundle == self.session.bundle, + Session.id != self.session.id, + ) + .one_or_none() + ) + if old_session: + for model in [Asset, Chunk, Module]: + to_be_deleted = self.db_session.query(model).filter( + model.session == old_session + ) + for item in to_be_deleted: + self.db_session.delete(item) + self.db_session.flush() + self.db_session.delete(old_session) + self.db_session.flush() + + # save top level bundle stats info + self.session.info = json.dumps(self.info) + + # this happens last so that we could potentially handle any ordering + # of top-level keys inside the JSON (i.e. we couldn't associate a chunk + # to an asset above if we parse the chunk before the asset) + self._create_associations() + + assert self.session.bundle is not None + return self.session.id + except Exception as e: + # Inject the plugin name to the Exception object so we have visibilitity on which plugin + # is causing the trouble. 
+ e.bundle_analysis_plugin_name = self.info.get("plugin_name", "unknown") + raise e + + def _asset_type(self, name: str) -> AssetType: + extension = get_extension(name) + + if extension in ["js"]: + return AssetType.JAVASCRIPT + if extension in ["css"]: + return AssetType.STYLESHEET + if extension in ["woff", "woff2", "ttf", "otf", "eot"]: + return AssetType.FONT + if extension in ["jpg", "jpeg", "png", "gif", "svg", "webp", "apng", "avif"]: + return AssetType.IMAGE + + return AssetType.UNKNOWN + + def _parse_info(self, event: Tuple[str, str, str]): + prefix, _, value = event + + # session info + if prefix == "version": + self.info["version"] = value + elif prefix == "bundler.name": + self.info["bundler_name"] = value + elif prefix == "bundler.version": + self.info["bundler_version"] = value + elif prefix == "builtAt": + self.info["built_at"] = value + elif prefix == "plugin.name": + self.info["plugin_name"] = value + elif prefix == "plugin.version": + self.info["plugin_version"] = value + elif prefix == "duration": + self.info["duration"] = value + + def _parse_event(self, event: Tuple[str, str, str]): + prefix, _, value = event + prefix_path = prefix.split(".") + + # asset / chunks / modules + if prefix_path[0] == "assets": + self._parse_assets_event(*event) + elif prefix_path[0] == "chunks": + self._parse_chunks_event(*event) + elif prefix_path[0] == "modules": + self._parse_modules_event(*event) + + # bundle name + elif prefix == "bundleName": + if not re.fullmatch(r"^[\w\d_:/@\.{}\[\]$-]+$", value): + log.info(f'bundle name does not match regex: "{value}"') + raise Exception("invalid bundle name") + bundle = self.db_session.query(Bundle).filter_by(name=value).first() + if bundle is None: + bundle = Bundle(name=value) + self.db_session.add(bundle) + self.session.bundle = bundle + + def _parse_assets_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("assets.item", "start_map"): + # new asset + assert self.asset is None + self.asset = Asset(session_id=self.session.id) + elif prefix == "assets.item.name": + self.asset.name = value + elif prefix == "assets.item.normalized": + self.asset.normalized_name = value + elif prefix == "assets.item.size": + self.asset.size = int(value) + elif (prefix, event) == ("assets.item", "end_map"): + self.asset_list.append( + dict( + session_id=self.asset.session_id, + name=self.asset.name, + normalized_name=self.asset.normalized_name, + size=self.asset.size, + gzip_size=self.asset.size // 1000, + uuid=str(uuid.uuid4()), + asset_type=self._asset_type(self.asset.name), + ) + ) + + # reset parser state + self.asset = None + + def _parse_chunks_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("chunks.item", "start_map"): + # new chunk + assert self.chunk is None + self.chunk = Chunk(session_id=self.session.id) + elif prefix == "chunks.item.id": + self.chunk.external_id = value + elif prefix == "chunks.item.uniqueId": + self.chunk.unique_external_id = value + elif prefix == "chunks.item.initial": + self.chunk.initial = value + elif prefix == "chunks.item.entry": + self.chunk.entry = value + elif prefix == "chunks.item.files.item": + self.chunk_asset_names.append(value) + elif (prefix, event) == ("chunks.item", "end_map"): + self.chunk_list.append( + dict( + session_id=self.chunk.session_id, + external_id=self.chunk.external_id, + unique_external_id=self.chunk.unique_external_id, + initial=self.chunk.initial, + entry=self.chunk.entry, + ) + ) + + self.chunk_asset_names_index[self.chunk.unique_external_id] = ( + 
self.chunk_asset_names + ) + # reset parser state + self.chunk = None + self.chunk_asset_names = [] + + def _parse_modules_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("modules.item", "start_map"): + # new module + assert self.module is None + self.module = Module(session_id=self.session.id) + elif prefix == "modules.item.name": + self.module.name = value + elif prefix == "modules.item.size": + self.module.size = int(value) + elif prefix == "modules.item.chunkUniqueIds.item": + self.module_chunk_unique_external_ids.append(value) + elif (prefix, event) == ("modules.item", "end_map"): + self.module_list.append( + dict( + session_id=self.module.session_id, + name=self.module.name, + size=self.module.size, + ) + ) + + self.module_chunk_unique_external_ids_index[self.module.name] = ( + self.module_chunk_unique_external_ids + ) + # reset parser state + self.module = None + self.module_chunk_unique_external_ids = [] + + def _create_associations(self): + # associate chunks to assets + inserts = [] + assets: list[Asset] = ( + self.db_session.query(Asset) + .filter( + Asset.session_id == self.session.id, + ) + .all() + ) + + asset_name_to_id = {asset.name: asset.id for asset in assets} + + chunks: list[Chunk] = ( + self.db_session.query(Chunk) + .filter( + Chunk.session_id == self.session.id, + ) + .all() + ) + + chunk_unique_id_to_id = {chunk.unique_external_id: chunk.id for chunk in chunks} + + modules = ( + self.db_session.query(Module) + .filter( + Module.session_id == self.session.id, + ) + .all() + ) + + for chunk in chunks: + chunk_id = chunk.id + asset_names = self.chunk_asset_names_index[chunk.unique_external_id] + inserts.extend( + [ + dict(asset_id=asset_name_to_id[asset_name], chunk_id=chunk_id) + for asset_name in asset_names + ] + ) + if inserts: + self.db_session.execute(assets_chunks.insert(), inserts) + + # associate modules to chunks + # FIXME: this isn't quite right - need to sort out how non-JS assets reference chunks + inserts = [] + + modules: list[Module] = self.db_session.query(Module).filter( + Module.session_id == self.session.id, + ) + for module in modules: + module_id = module.id + chunk_unique_external_ids = self.module_chunk_unique_external_ids_index[ + module.name + ] + + inserts.extend( + [ + dict( + chunk_id=chunk_unique_id_to_id[unique_external_id], + module_id=module_id, + ) + for unique_external_id in chunk_unique_external_ids + ] + ) + if inserts: + self.db_session.execute(chunks_modules.insert(), inserts) diff --git a/shared/bundle_analysis/parsers/v2.py b/shared/bundle_analysis/parsers/v2.py new file mode 100644 index 00000000..1b530583 --- /dev/null +++ b/shared/bundle_analysis/parsers/v2.py @@ -0,0 +1,374 @@ +import json +import logging +import re +import uuid +from typing import Tuple + +import ijson +from sqlalchemy.orm import Session as DbSession + +from shared.bundle_analysis.models import ( + Asset, + AssetType, + Bundle, + Chunk, + Module, + Session, + assets_chunks, + chunks_modules, +) +from shared.bundle_analysis.utils import get_extension + +log = logging.getLogger(__name__) + + +""" +Version 2 Schema +{ + "version": "2", + "plugin": { + "name": str + "version": str + }, + "builtAt": int, + "duration": int, + "bundler": { "name": str, "version": str }, + "bundleName": str, + "assets": [{ + "name": str", + "size": int, + "gzipSize": int, + "normalized": str + }], + "chunks": [{ + "id": str, + "uniqueId": str, + "entry": bool, + "initial": bool, + "files": [str], + "names": [str] + }], + "modules": [{ + "name": str, + 
"size": int, + "chunkUniqueIds": [str] + }] +} +""" + + +class ParserV2: + """ + This does a streaming JSON parse of the stats JSON file referenced by `path`. + It's more complicated that just doing a `json.loads` but should keep our memory + usage constrained. + """ + + def __init__(self, db_session: DbSession): + self.db_session = db_session + + def reset(self): + """ + Resets temporary parser state in order to parse a new file path. + """ + # chunk unique id -> asset name list + self.chunk_asset_names_index = {} + + # module name -> chunk external id list + self.module_chunk_unique_external_ids_index = {} + + # misc. top-level info from the stats data (i.e. bundler version, bundle time, etc.) + self.info = {} + + # temporary parser state + self.session = None + self.asset = None + self.chunk = None + self.chunk_asset_names = [] + self.module = None + self.module_chunk_unique_external_ids = [] + + self.asset_list = [] + self.chunk_list = [] + self.module_list = [] + + def parse(self, path: str) -> int: + try: + self.reset() + + # Retrieve the info section first before parsing all the other things + # this way when an error is raised we know which bundle plugin caused it + with open(path, "rb") as f: + for event in ijson.parse(f): + self._parse_info(event) + + self.session = Session(info={}) + self.db_session.add(self.session) + self.db_session.flush() + + with open(path, "rb") as f: + for event in ijson.parse(f): + self._parse_event(event) + + if self.asset_list: + insert_asset = Asset.__table__.insert().values(self.asset_list) + self.db_session.execute(insert_asset) + + if self.chunk_list: + insert_chunks = Chunk.__table__.insert().values(self.chunk_list) + self.db_session.execute(insert_chunks) + + if self.module_list: + insert_modules = Module.__table__.insert().values(self.module_list) + self.db_session.execute(insert_modules) + + self.db_session.flush() + + # Delete old session/asset/chunk/module with the same bundle name if applicable + old_session = ( + self.db_session.query(Session) + .filter( + Session.bundle == self.session.bundle, + Session.id != self.session.id, + ) + .one_or_none() + ) + if old_session: + for model in [Asset, Chunk, Module]: + to_be_deleted = self.db_session.query(model).filter( + model.session == old_session + ) + for item in to_be_deleted: + self.db_session.delete(item) + self.db_session.flush() + self.db_session.delete(old_session) + self.db_session.flush() + + # save top level bundle stats info + self.session.info = json.dumps(self.info) + + # this happens last so that we could potentially handle any ordering + # of top-level keys inside the JSON (i.e. we couldn't associate a chunk + # to an asset above if we parse the chunk before the asset) + self._create_associations() + + assert self.session.bundle is not None + return self.session.id + except Exception as e: + # Inject the plugin name to the Exception object so we have visibilitity on which plugin + # is causing the trouble. 
+ e.bundle_analysis_plugin_name = self.info.get("plugin_name", "unknown") + raise e + + def _asset_type(self, name: str) -> AssetType: + extension = get_extension(name) + + if extension in ["js"]: + return AssetType.JAVASCRIPT + if extension in ["css"]: + return AssetType.STYLESHEET + if extension in ["woff", "woff2", "ttf", "otf", "eot"]: + return AssetType.FONT + if extension in ["jpg", "jpeg", "png", "gif", "svg", "webp", "apng", "avif"]: + return AssetType.IMAGE + + return AssetType.UNKNOWN + + def _parse_info(self, event: Tuple[str, str, str]): + prefix, _, value = event + + # session info + if prefix == "version": + self.info["version"] = value + elif prefix == "bundler.name": + self.info["bundler_name"] = value + elif prefix == "bundler.version": + self.info["bundler_version"] = value + elif prefix == "builtAt": + self.info["built_at"] = value + elif prefix == "plugin.name": + self.info["plugin_name"] = value + elif prefix == "plugin.version": + self.info["plugin_version"] = value + elif prefix == "duration": + self.info["duration"] = value + + def _parse_event(self, event: Tuple[str, str, str]): + prefix, _, value = event + prefix_path = prefix.split(".") + + # asset / chunks / modules + if prefix_path[0] == "assets": + self._parse_assets_event(*event) + elif prefix_path[0] == "chunks": + self._parse_chunks_event(*event) + elif prefix_path[0] == "modules": + self._parse_modules_event(*event) + + # bundle name + elif prefix == "bundleName": + if not re.fullmatch(r"^[\w\d_:/@\.{}\[\]$-]+$", value): + log.info(f'bundle name does not match regex: "{value}"') + raise Exception("invalid bundle name") + bundle = self.db_session.query(Bundle).filter_by(name=value).first() + if bundle is None: + bundle = Bundle(name=value) + self.db_session.add(bundle) + self.session.bundle = bundle + + def _parse_assets_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("assets.item", "start_map"): + # new asset + assert self.asset is None + self.asset = Asset(session_id=self.session.id) + elif prefix == "assets.item.name": + self.asset.name = value + elif prefix == "assets.item.normalized": + self.asset.normalized_name = value + elif prefix == "assets.item.size": + self.asset.size = int(value) + elif prefix == "assets.item.gzipSize": + self.asset.gzip_size = int(value) + elif (prefix, event) == ("assets.item", "end_map"): + self.asset_list.append( + dict( + session_id=self.asset.session_id, + name=self.asset.name, + normalized_name=self.asset.normalized_name, + size=self.asset.size, + gzip_size=self.asset.gzip_size, + uuid=str(uuid.uuid4()), + asset_type=self._asset_type(self.asset.name), + ) + ) + + # reset parser state + self.asset = None + + def _parse_chunks_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("chunks.item", "start_map"): + # new chunk + assert self.chunk is None + self.chunk = Chunk(session_id=self.session.id) + elif prefix == "chunks.item.id": + self.chunk.external_id = value + elif prefix == "chunks.item.uniqueId": + self.chunk.unique_external_id = value + elif prefix == "chunks.item.initial": + self.chunk.initial = value + elif prefix == "chunks.item.entry": + self.chunk.entry = value + elif prefix == "chunks.item.files.item": + self.chunk_asset_names.append(value) + elif (prefix, event) == ("chunks.item", "end_map"): + self.chunk_list.append( + dict( + session_id=self.chunk.session_id, + external_id=self.chunk.external_id, + unique_external_id=self.chunk.unique_external_id, + initial=self.chunk.initial, + entry=self.chunk.entry, + ) + 
) + + self.chunk_asset_names_index[self.chunk.unique_external_id] = ( + self.chunk_asset_names + ) + # reset parser state + self.chunk = None + self.chunk_asset_names = [] + + def _parse_modules_event(self, prefix: str, event: str, value: str): + if (prefix, event) == ("modules.item", "start_map"): + # new module + assert self.module is None + self.module = Module(session_id=self.session.id) + elif prefix == "modules.item.name": + self.module.name = value + elif prefix == "modules.item.size": + self.module.size = int(value) + elif prefix == "modules.item.chunkUniqueIds.item": + self.module_chunk_unique_external_ids.append(value) + elif (prefix, event) == ("modules.item", "end_map"): + self.module_list.append( + dict( + session_id=self.module.session_id, + name=self.module.name, + size=self.module.size, + ) + ) + + self.module_chunk_unique_external_ids_index[self.module.name] = ( + self.module_chunk_unique_external_ids + ) + # reset parser state + self.module = None + self.module_chunk_unique_external_ids = [] + + def _create_associations(self): + # associate chunks to assets + inserts = [] + assets: list[Asset] = ( + self.db_session.query(Asset) + .filter( + Asset.session_id == self.session.id, + ) + .all() + ) + + asset_name_to_id = {asset.name: asset.id for asset in assets} + + chunks: list[Chunk] = ( + self.db_session.query(Chunk) + .filter( + Chunk.session_id == self.session.id, + ) + .all() + ) + + chunk_unique_id_to_id = {chunk.unique_external_id: chunk.id for chunk in chunks} + + modules = ( + self.db_session.query(Module) + .filter( + Module.session_id == self.session.id, + ) + .all() + ) + + for chunk in chunks: + chunk_id = chunk.id + asset_names = self.chunk_asset_names_index[chunk.unique_external_id] + inserts.extend( + [ + dict(asset_id=asset_name_to_id[asset_name], chunk_id=chunk_id) + for asset_name in asset_names + ] + ) + if inserts: + self.db_session.execute(assets_chunks.insert(), inserts) + + # associate modules to chunks + # FIXME: this isn't quite right - need to sort out how non-JS assets reference chunks + inserts = [] + + modules: list[Module] = self.db_session.query(Module).filter( + Module.session_id == self.session.id, + ) + for module in modules: + module_id = module.id + chunk_unique_external_ids = self.module_chunk_unique_external_ids_index[ + module.name + ] + + inserts.extend( + [ + dict( + chunk_id=chunk_unique_id_to_id[unique_external_id], + module_id=module_id, + ) + for unique_external_id in chunk_unique_external_ids + ] + ) + if inserts: + self.db_session.execute(chunks_modules.insert(), inserts) diff --git a/shared/bundle_analysis/report.py b/shared/bundle_analysis/report.py index da658212..3eb4087c 100644 --- a/shared/bundle_analysis/report.py +++ b/shared/bundle_analysis/report.py @@ -68,6 +68,10 @@ def hashed_name(self): def size(self): return self.asset.size + @property + def gzip_size(self): + return self.asset.gzip_size + @property def uuid(self): return self.asset.uuid @@ -225,7 +229,7 @@ def ingest(self, path: str) -> int: Ingest the bundle stats JSON at the given file path. Returns session ID of ingested data. 
""" - parser = Parser(self.db_session) + parser = Parser(path, self.db_session).get_proper_parser() session_id = parser.parse(path) self.db_session.commit() return session_id diff --git a/tests/samples/sample_bundle_stats.json b/tests/samples/sample_bundle_stats.json index c92b2f8b..bb0ac8d2 100644 --- a/tests/samples/sample_bundle_stats.json +++ b/tests/samples/sample_bundle_stats.json @@ -1,5 +1,5 @@ { - "version": "1", + "version": "2", "plugin": { "name": "codecov-vite-bundle-analysis-plugin", "version": "1.0.0" @@ -12,26 +12,31 @@ { "name": "assets/react-35ef61ed.svg", "size": 4126, + "gzipSize": 4125, "normalized": "assets/react-*.svg" }, { "name": "assets/index-d526a0c5.css", "size": 1421, + "gzipSize": 1420, "normalized": "assets/index-*.css" }, { "name": "assets/LazyComponent-fcbb0922.js", "size": 294, + "gzipSize": 293, "normalized": "assets/LazyComponent-*.js" }, { "name": "assets/index-c8676264.js", "size": 154, + "gzipSize": 153, "normalized": "assets/index-*.js" }, { "name": "assets/index-666d2e09.js", "size": 144577, + "gzipSize": 144576, "normalized": "assets/index-*.js" } ], diff --git a/tests/samples/sample_bundle_stats_other.json b/tests/samples/sample_bundle_stats_other.json index f5c0d713..2a44d7ec 100644 --- a/tests/samples/sample_bundle_stats_other.json +++ b/tests/samples/sample_bundle_stats_other.json @@ -1,5 +1,5 @@ { - "version": "1", + "version": "2", "plugin": { "name": "codecov-vite-bundle-analysis-plugin", "version": "1.0.0" @@ -12,26 +12,31 @@ { "name": "assets/other-35ef61ed.svg", "size": 5126, + "gzipSize": 5125, "normalized": "assets/other-*.svg" }, { "name": "assets/index-d526a0c5.css", "size": 1421, + "gzipSize": 1420, "normalized": "assets/index-*.css" }, { "name": "assets/LazyComponent-fcbb0922.js", "size": 294, + "gzipSize": 293, "normalized": "assets/LazyComponent-*.js" }, { "name": "assets/index-c8676264.js", "size": 254, + "gzipSize": 253, "normalized": "assets/index-*.js" }, { "name": "assets/index-666d2e09.js", "size": 144577, + "gzipSize": 144576, "normalized": "assets/index-*.js" } ], diff --git a/tests/samples/sample_bundle_stats_v1.json b/tests/samples/sample_bundle_stats_v1.json new file mode 100644 index 00000000..86b16b10 --- /dev/null +++ b/tests/samples/sample_bundle_stats_v1.json @@ -0,0 +1,201 @@ +{ + "version": "1", + "plugin": { + "name": "codecov-vite-bundle-analysis-plugin", + "version": "1.0.0" + }, + "builtAt": 1701451048604, + "duration": 331, + "bundler": { "name": "rollup", "version": "3.29.4" }, + "bundleName": "sample", + "assets": [ + { + "name": "assets/react-35ef61ed.svg", + "size": 4126, + "normalized": "assets/react-*.svg" + }, + { + "name": "assets/index-d526a0c5.css", + "size": 1421, + "normalized": "assets/index-*.css" + }, + { + "name": "assets/LazyComponent-fcbb0922.js", + "size": 294, + "normalized": "assets/LazyComponent-*.js" + }, + { + "name": "assets/index-c8676264.js", + "size": 154, + "normalized": "assets/index-*.js" + }, + { + "name": "assets/index-666d2e09.js", + "size": 144577, + "normalized": "assets/index-*.js" + } + ], + "chunks": [ + { + "id": "LazyComponent", + "uniqueId": "0-LazyComponent", + "entry": false, + "initial": true, + "files": ["assets/LazyComponent-fcbb0922.js"], + "names": ["LazyComponent"] + }, + { + "id": "index", + "uniqueId": "1-index", + "entry": false, + "initial": true, + "files": ["assets/index-c8676264.js"], + "names": ["index"] + }, + { + "id": "index", + "uniqueId": "2-index", + "entry": true, + "initial": true, + "files": ["assets/index-666d2e09.js"], + "names": ["index"] 
+ } + ], + "modules": [ + { + "name": "./src/LazyComponent/LazyComponent.tsx", + "size": 497, + "chunkUniqueIds": ["0-LazyComponent"] + }, + { + "name": "./src/IndexedLazyComponent/IndexedLazyComponent.tsx", + "size": 189, + "chunkUniqueIds": ["1-index"] + }, + { + "name": "./src/IndexedLazyComponent/index.ts", + "size": 0, + "chunkUniqueIds": ["1-index"] + }, + { + "name": "./vite/modulepreload-polyfill", + "size": 1548, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "./commonjsHelpers.js", + "size": 140, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/jsx-runtime.js?commonjs-module", + "size": 31, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/cjs/react-jsx-runtime.production.min.js?commonjs-exports", + "size": 40, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/index.js?commonjs-module", + "size": 26, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/cjs/react.production.min.js?commonjs-exports", + "size": 30, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/cjs/react.production.min.js", + "size": 7591, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/index.js", + "size": 144, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/cjs/react-jsx-runtime.production.min.js", + "size": 919, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react@18.2.0/node_modules/react/jsx-runtime.js", + "size": 103, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/client.js?commonjs-exports", + "size": 16, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/index.js?commonjs-module", + "size": 29, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/cjs/react-dom.production.min.js?commonjs-exports", + "size": 33, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/scheduler@0.23.0/node_modules/scheduler/index.js?commonjs-module", + "size": 30, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/scheduler@0.23.0/node_modules/scheduler/cjs/scheduler.production.min.js?commonjs-exports", + "size": 34, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/scheduler@0.23.0/node_modules/scheduler/cjs/scheduler.production.min.js", + "size": 4315, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/scheduler@0.23.0/node_modules/scheduler/index.js", + "size": 94, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/cjs/react-dom.production.min.js", + "size": 132340, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/index.js", + "size": 755, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../node_modules/.pnpm/react-dom@18.2.0_react@18.2.0/node_modules/react-dom/client.js", + "size": 102, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "./vite/preload-helper", + "size": 2488, + 
"chunkUniqueIds": ["2-index"] + }, + { + "name": "./src/assets/react.svg", + "size": 45, + "chunkUniqueIds": ["2-index"] + }, + { + "name": "../../../../../../vite.svg", + "size": 51, + "chunkUniqueIds": ["2-index"] + }, + { "name": "./src/App.css", "size": 17, "chunkUniqueIds": ["2-index"] }, + { "name": "./src/App.tsx", "size": 1977, "chunkUniqueIds": ["2-index"] }, + { "name": "./src/index.css", "size": 17, "chunkUniqueIds": ["2-index"] }, + { "name": "./src/main.tsx", "size": 181, "chunkUniqueIds": ["2-index"] }, + { "name": "./index.html", "size": 0, "chunkUniqueIds": ["2-index"] } + ] + } \ No newline at end of file diff --git a/tests/unit/bundle_analysis/test_bundle_analysis.py b/tests/unit/bundle_analysis/test_bundle_analysis.py index cf5f74d5..32976dcc 100644 --- a/tests/unit/bundle_analysis/test_bundle_analysis.py +++ b/tests/unit/bundle_analysis/test_bundle_analysis.py @@ -22,6 +22,10 @@ / "sample_bundle_stats_invalid_name.json" ) +sample_bundle_stats_path_4 = ( + Path(__file__).parent.parent.parent / "samples" / "sample_bundle_stats_v1.json" +) + def test_create_bundle_report(): try: @@ -43,13 +47,14 @@ def test_create_bundle_report(): asset_reports = list(bundle_report.asset_reports()) assert [ - (ar.name, ar.hashed_name, ar.size, len(ar.modules()), ar.asset_type) + (ar.name, ar.hashed_name, ar.size, ar.gzip_size, len(ar.modules()), ar.asset_type) for ar in asset_reports ] == [ ( "assets/react-*.svg", "assets/react-35ef61ed.svg", 4126, + 4125, 0, AssetType.IMAGE, ), @@ -57,6 +62,7 @@ def test_create_bundle_report(): "assets/index-*.css", "assets/index-d526a0c5.css", 1421, + 1420, 0, AssetType.STYLESHEET, ), @@ -64,6 +70,7 @@ def test_create_bundle_report(): "assets/LazyComponent-*.js", "assets/LazyComponent-fcbb0922.js", 294, + 293, 1, AssetType.JAVASCRIPT, ), @@ -71,6 +78,7 @@ def test_create_bundle_report(): "assets/index-*.js", "assets/index-c8676264.js", 154, + 153, 2, AssetType.JAVASCRIPT, ), @@ -80,6 +88,7 @@ def test_create_bundle_report(): "assets/index-*.js", "assets/index-666d2e09.js", 144577, + 144576, 28, AssetType.JAVASCRIPT, ), @@ -251,7 +260,7 @@ def test_bundle_report_info(): bundle_report = report.bundle_report("sample") bundle_report_info = bundle_report.info() - assert bundle_report_info["version"] == "1" + assert bundle_report_info["version"] == "2" assert bundle_report_info["bundler_name"] == "rollup" assert bundle_report_info["bundler_version"] == "3.29.4" assert bundle_report_info["built_at"] == 1701451048604 @@ -275,7 +284,7 @@ def test_bundle_report_size_integer(): def test_bundle_parser_error(): with patch( - "shared.bundle_analysis.parser.Parser._parse_assets_event", + "shared.bundle_analysis.parsers.ParserV1._parse_assets_event", side_effect=Exception("MockError"), ): report = BundleAnalysisReport() @@ -334,3 +343,84 @@ def test_bundle_file_save_unknown_error(): assert str(excinfo) == "UnknownError" assert type(excinfo) == Exception + + +def test_create_bundle_report_v1(): + try: + report = BundleAnalysisReport() + session_id = report.ingest(sample_bundle_stats_path_4) + assert session_id == 1 + + assert report.metadata() == { + MetadataKey.SCHEMA_VERSION: 2, + } + + bundle_reports = list(report.bundle_reports()) + assert len(bundle_reports) == 1 + + bundle_report = report.bundle_report("invalid") + assert bundle_report is None + bundle_report = report.bundle_report("sample") + + bundle_report_info = bundle_report.info() + assert bundle_report_info["version"] == "1" + + asset_reports = list(bundle_report.asset_reports()) + + assert [ + (ar.name, 
ar.hashed_name, ar.size, ar.gzip_size, len(ar.modules()), ar.asset_type) + for ar in asset_reports + ] == [ + ( + "assets/react-*.svg", + "assets/react-35ef61ed.svg", + 4126, + 4, + 0, + AssetType.IMAGE, + ), + ( + "assets/index-*.css", + "assets/index-d526a0c5.css", + 1421, + 1, + 0, + AssetType.STYLESHEET, + ), + ( + "assets/LazyComponent-*.js", + "assets/LazyComponent-fcbb0922.js", + 294, + 0, + 1, + AssetType.JAVASCRIPT, + ), + ( + "assets/index-*.js", + "assets/index-c8676264.js", + 154, + 0, + 2, + AssetType.JAVASCRIPT, + ), + # FIXME: this is wrong since it's capturing the SVG and CSS modules as well. + # Made a similar note in the parser code where the associations are made + ( + "assets/index-*.js", + "assets/index-666d2e09.js", + 144577, + 144, + 28, + AssetType.JAVASCRIPT, + ), + ] + + for ar in asset_reports: + for module in ar.modules(): + assert isinstance(module.name, str) + assert isinstance(module.size, int) + + assert bundle_report.total_size() == 150572 + assert report.session_count() == 1 + finally: + report.cleanup() \ No newline at end of file
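
Example (not part of the patch): a minimal sketch of the call path that report.py's ingest() now follows, using only names visible in this diff (Parser, get_proper_parser, parse). The wrapper function ingest_stats and the idea of passing an already-constructed SQLAlchemy session are illustrative assumptions; the real session setup lives in BundleAnalysisReport, outside this diff.

    from sqlalchemy.orm import Session as DbSession

    from shared.bundle_analysis.parser import Parser


    def ingest_stats(path: str, db_session: DbSession) -> int:
        # get_proper_parser() reads the top-level "version" key of the stats file
        # and returns a ParserV1 or ParserV2 instance via PARSER_VERSION_MAPPING;
        # an unknown or missing version raises instead of silently falling back.
        parser = Parser(path, db_session).get_proper_parser()
        # Both parser versions keep the same parse(path) -> session_id contract.
        return parser.parse(path)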
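The ParserInterface added in parsers/base.py is a pure duck-typing check: its __subclasshook__ makes issubclass() succeed for any class exposing a callable parse attribute, with no inheritance required, which is what get_proper_parser() relies on. A self-contained sketch of that behaviour (GoodParser and BadParser are made-up names for illustration):

    import abc


    class ParserInterface(metaclass=abc.ABCMeta):
        @classmethod
        def __subclasshook__(cls, subclass):
            # Same hook as parsers/base.py: "is a parser" means "has a callable parse".
            return hasattr(subclass, "parse") and callable(subclass.parse)


    class GoodParser:
        def parse(self, path: str) -> int:
            return 1


    class BadParser:
        pass


    assert issubclass(GoodParser, ParserInterface)      # has parse(), accepted
    assert not issubclass(BadParser, ParserInterface)   # no parse(), rejected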
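Worked example of the gzip_size fallback in parsers/v1.py: version "1" stats files carry no gzipSize field, so ParserV1 fills the new non-nullable column with size // 1000, which is exactly what the expectations in test_create_bundle_report_v1 encode:

    # Raw asset sizes from tests/samples/sample_bundle_stats_v1.json
    sizes = [4126, 1421, 294, 154, 144577]
    print([size // 1000 for size in sizes])  # -> [4, 1, 0, 0, 144], matching the test fixture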