From 70d0c5d589fe9c04970973e687907e115b49ebca Mon Sep 17 00:00:00 2001 From: Trevor Elkins <1447798+trevor-e@users.noreply.github.com> Date: Wed, 1 Jul 2026 17:17:07 +0000 Subject: [PATCH] Group DEX treemap by known libraries Recognized third-party libraries (AndroidX, Kotlin, Firebase, Sentry, etc.) are pulled out of the DEX package hierarchy and grouped under a single "Libraries" node, keyed by canonical library name with the matched package prefix stripped. First-party code keeps its normal package tree. Each class is placed exactly once, so sizes are preserved unless two classes of the same library share a prefix-stripped path (e.g. rx.Observable and io.reactivex.Observable under RxJava); those map to the same tree slot and only the last one inserted is kept. Adds a conservative, easy-to-extend catalog of Android libraries keyed by distinctive package prefixes. This is the Android analog of EME-139 (group iOS by known libraries). Generated with [Linear](https://linear.app/getsentry/issue/EME-108/group-dex-by-known-libraries#agent-session-d849543d) Co-authored-by: linear-code[bot] <222613912+linear-code[bot]@users.noreply.github.com> --- .../size/treemap/android_known_libraries.py | 113 +++++++++++++ .../size/treemap/dex_element_builder.py | 152 ++++++++++++------ .../treemap/test_android_known_libraries.py | 43 +++++ .../size/treemap/test_dex_element_builder.py | 133 +++++++++++++++ 4 files changed, 394 insertions(+), 47 deletions(-) create mode 100644 src/launchpad/size/treemap/android_known_libraries.py create mode 100644 tests/unit/size/treemap/test_android_known_libraries.py create mode 100644 tests/unit/size/treemap/test_dex_element_builder.py diff --git a/src/launchpad/size/treemap/android_known_libraries.py b/src/launchpad/size/treemap/android_known_libraries.py new file mode 100644 index 00000000..7357f335 --- /dev/null +++ b/src/launchpad/size/treemap/android_known_libraries.py @@ -0,0 +1,113 @@ +"""Catalog of well-known Android libraries for DEX treemap grouping. + +DEX classes are grouped into a package hierarchy (e.g. ``com`` > ``google`` > +``firebase`` > …). Third-party SDKs are indistinguishable from the app's own code +in that hierarchy. 
This catalog maps distinctive Java/Kotlin package prefixes to a +canonical library name so the treemap builder can pull recognized classes out and +group them under a single ``Libraries`` node. + +Keep entries conservative: use prefixes that are distinctive enough to avoid +colliding with first-party code. In particular, prefer specific sub-packages over +broad vendor roots (e.g. ``com.emergetools.snapshots`` rather than +``com.emergetools``, which would also match an app shipped by Emerge Tools). +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class KnownLibrary: + """A known Android library and the package prefixes used to recognize it.""" + + name: str + # Distinctive package prefixes that map to this library. Matching is done on + # package boundaries: a prefix matches an FQN that equals it or starts with + # ``prefix + "."``. + package_prefixes: tuple[str, ...] + + +# Curated, non-exhaustive catalog of popular Android libraries. Extend as needed. +KNOWN_LIBRARIES: tuple[KnownLibrary, ...] 
= ( + # Jetpack / Kotlin + KnownLibrary("AndroidX", ("androidx",)), + KnownLibrary("Android Support", ("android.support",)), + KnownLibrary("Kotlin", ("kotlin", "kotlinx")), + # Google + KnownLibrary("Firebase", ("com.google.firebase",)), + KnownLibrary("Google Play Services", ("com.google.android.gms",)), + KnownLibrary("Google Play Core", ("com.google.android.play",)), + KnownLibrary("Material Components", ("com.google.android.material",)), + KnownLibrary("Gson", ("com.google.gson",)), + KnownLibrary("Guava", ("com.google.common",)), + KnownLibrary("Protobuf", ("com.google.protobuf",)), + KnownLibrary("Dagger", ("dagger",)), + # Square + KnownLibrary("OkHttp", ("okhttp3",)), + KnownLibrary("Okio", ("okio",)), + KnownLibrary("Retrofit", ("retrofit2",)), + KnownLibrary("Moshi", ("com.squareup.moshi",)), + KnownLibrary("Picasso", ("com.squareup.picasso",)), + KnownLibrary("LeakCanary", ("leakcanary", "com.squareup.leakcanary")), + # Image loading + KnownLibrary("Coil", ("coil",)), + KnownLibrary("Glide", ("com.bumptech.glide",)), + # Reactive / async + KnownLibrary("RxJava", ("io.reactivex", "rx")), + KnownLibrary("Timber", ("timber.log",)), + KnownLibrary("EventBus", ("org.greenrobot.eventbus",)), + # Serialization + KnownLibrary("Jackson", ("com.fasterxml.jackson",)), + KnownLibrary("Apache Commons", ("org.apache.commons",)), + # Observability + KnownLibrary("Sentry", ("io.sentry",)), + KnownLibrary("Bugsnag", ("com.bugsnag",)), + KnownLibrary("Datadog", ("com.datadog",)), + # Analytics / attribution + KnownLibrary("Amplitude", ("com.amplitude",)), + KnownLibrary("Mixpanel", ("com.mixpanel",)), + KnownLibrary("Segment", ("com.segment.analytics",)), + KnownLibrary("Adjust", ("com.adjust.sdk",)), + KnownLibrary("AppsFlyer", ("com.appsflyer",)), + KnownLibrary("Branch", ("io.branch",)), + KnownLibrary("Braze", ("com.braze", "com.appboy")), + # Other popular SDKs + KnownLibrary("Facebook", ("com.facebook",)), + KnownLibrary("Stripe", ("com.stripe",)), + 
KnownLibrary("Lottie", ("com.airbnb.lottie",)), + KnownLibrary("Realm", ("io.realm",)), + KnownLibrary("Koin", ("org.koin",)), + KnownLibrary("Ktor", ("io.ktor",)), + KnownLibrary("ThreeTenABP", ("org.threeten.bp", "com.jakewharton.threetenabp")), + KnownLibrary("EmergeTools", ("com.emergetools.snapshots",)), +) + + +def _build_prefix_index() -> tuple[tuple[str, str], ...]: + # Sort by descending prefix length so the most specific prefix wins (e.g. + # ``com.google.firebase`` is preferred over a hypothetical ``com.google``). + pairs = [(prefix, library.name) for library in KNOWN_LIBRARIES for prefix in library.package_prefixes] + pairs.sort(key=lambda pair: len(pair[0]), reverse=True) + return tuple(pairs) + + +_PREFIX_INDEX = _build_prefix_index() + + +def resolve_known_library(fqn: str) -> tuple[str, str] | None: + """Resolve a class FQN to a known library. + + Returns ``(library_name, matched_prefix)`` for the most specific matching + package prefix, or ``None`` when the FQN doesn't belong to a known library. + Matching is done on package boundaries so ``androidx`` matches + ``androidx.core.App`` but not ``androidxfoo.Bar``. 
+ """ + if not fqn: + return None + + for prefix, library_name in _PREFIX_INDEX: + if fqn == prefix or fqn.startswith(prefix + "."): + return library_name, prefix + + return None diff --git a/src/launchpad/size/treemap/dex_element_builder.py b/src/launchpad/size/treemap/dex_element_builder.py index 94a9d28b..db72b0e8 100644 --- a/src/launchpad/size/treemap/dex_element_builder.py +++ b/src/launchpad/size/treemap/dex_element_builder.py @@ -3,11 +3,14 @@ from launchpad.parsers.android.dex.types import ClassDefinition from launchpad.size.models.common import FileInfo from launchpad.size.models.treemap import TreemapElement, TreemapType +from launchpad.size.treemap.android_known_libraries import resolve_known_library from launchpad.size.treemap.treemap_element_builder import TreemapElementBuilder from launchpad.utils.logging import get_logger logger = get_logger(__name__) +LIBRARIES_NODE_NAME = "Libraries" + class DexElementBuilder(TreemapElementBuilder): def __init__( @@ -25,8 +28,8 @@ def build_element(self, file_info: FileInfo, display_name: str) -> TreemapElemen # to build the treemap. This is because there could be multiple # DEX files in APK and we want to group them by package vs file. - root_packages = self._build_package_tree() - size = sum(package.size for package in root_packages) + children = self._build_children() + size = sum(child.size for child in children) return TreemapElement( name=display_name, @@ -34,13 +37,49 @@ def build_element(self, file_info: FileInfo, display_name: str) -> TreemapElemen type=TreemapType.DEX, path=file_info.path, is_dir=True, - children=root_packages, + children=children, ) - def _build_package_tree(self) -> list[TreemapElement]: - package_tree: dict[str, dict] = {} + def _build_children(self) -> list[TreemapElement]: + """Partition classes into known third-party libraries and first-party code. 
+ + Recognized library classes are grouped under a single ``Libraries`` node + (keyed by canonical library name, with the matched package prefix stripped + from the nested hierarchy). Everything else keeps its normal package tree. + Sizes are preserved since each class is placed exactly once. + """ + # library name -> list of (package_parts, class_name, class_def) with the + # matched library prefix stripped from the package path. + library_entries: dict[str, list[tuple[list[str], str, ClassDefinition]]] = {} + first_party: list[ClassDefinition] = [] for class_def in self.class_definitions: + fqn = class_def.fqn() + match = resolve_known_library(fqn) + if match is None: + first_party.append(class_def) + continue + + library_name, matched_prefix = match + remainder = fqn[len(matched_prefix) + 1 :] if fqn.startswith(matched_prefix + ".") else "" + parts = remainder.split(".") if remainder else [] + package_parts = parts[:-1] + class_name = parts[-1] if parts else class_def.get_name() + library_entries.setdefault(library_name, []).append((package_parts, class_name, class_def)) + + children = self._build_package_elements(first_party) + + libraries_node = self._build_libraries_node(library_entries) + if libraries_node is not None: + children.insert(0, libraries_node) + + return children + + def _build_package_elements(self, class_definitions: list[ClassDefinition]) -> list[TreemapElement]: + """Build the package hierarchy for a set of classes keyed by their FQN.""" + root = self._new_node() + + for class_def in class_definitions: fqn = class_def.fqn() parts = fqn.split(".") @@ -48,54 +87,72 @@ def _build_package_tree(self) -> list[TreemapElement]: logger.warning(f"Invalid class definition with no package: {fqn}") continue - class_name = parts[-1] - package_parts = parts[:-1] + self._insert_class(root, parts[:-1], parts[-1], class_def) - # Build the package hierarchy - current_level = package_tree - for package_part in package_parts: - if package_part not in 
current_level: - current_level[package_part] = {"packages": {}, "classes": {}} - current_level = current_level[package_part]["packages"] - - # Add the class to the leaf package - leaf_package = package_tree - for package_part in package_parts: - if package_part not in leaf_package: - leaf_package[package_part] = {"packages": {}, "classes": {}} - if package_part == package_parts[-1]: - # This is the final package, add the class here - leaf_package[package_part]["classes"][class_name] = {"class_def": class_def} - else: - # Navigate to the next level - leaf_package = leaf_package[package_part]["packages"] - - return self._convert_tree_to_elements(package_tree) - - def _convert_tree_to_elements(self, package_tree: dict[str, dict], parent_path: str = "") -> list[TreemapElement]: - elements: list[TreemapElement] = [] + return self._node_to_elements(root) - for name, node in package_tree.items(): - package_path = f"{parent_path}.{name}" if parent_path else f"{name}" + def _build_libraries_node( + self, + library_entries: dict[str, list[tuple[list[str], str, ClassDefinition]]], + ) -> TreemapElement | None: + if not library_entries: + return None + + library_children: list[TreemapElement] = [] + for library_name, entries in library_entries.items(): + root = self._new_node() + for package_parts, class_name, class_def in entries: + self._insert_class(root, package_parts, class_name, class_def) + + children = self._node_to_elements(root, parent_path=library_name) + library_children.append( + TreemapElement( + name=library_name, + size=sum(child.size for child in children), + type=TreemapType.DEX, + path=library_name, + is_dir=True, + children=children, + ) + ) - # Process sub-packages - children = [] - if "packages" in node: - children.extend(self._convert_tree_to_elements(node["packages"], package_path)) + library_children.sort(key=lambda child: child.size, reverse=True) - # Process classes in this package - if "classes" in node: - for class_name, class_node in 
node["classes"].items(): - class_def = class_node["class_def"] - class_element = self._create_class_element(class_def) - children.append(class_element) + return TreemapElement( + name=LIBRARIES_NODE_NAME, + size=sum(child.size for child in library_children), + type=TreemapType.DEX, + path=LIBRARIES_NODE_NAME, + is_dir=True, + children=library_children, + ) - total_size = sum(child.size for child in children) + @staticmethod + def _new_node() -> dict: + return {"packages": {}, "classes": {}} + def _insert_class( + self, + root: dict, + package_parts: list[str], + class_name: str, + class_def: ClassDefinition, + ) -> None: + node = root + for part in package_parts: + node = node["packages"].setdefault(part, self._new_node()) + node["classes"][class_name] = class_def + + def _node_to_elements(self, node: dict, parent_path: str = "") -> list[TreemapElement]: + elements: list[TreemapElement] = [] + + for name, child_node in node["packages"].items(): + package_path = f"{parent_path}.{name}" if parent_path else name + children = self._node_to_elements(child_node, package_path) elements.append( TreemapElement( name=name, - size=total_size, + size=sum(child.size for child in children), type=TreemapType.DEX, path=package_path, is_dir=True, @@ -103,14 +160,15 @@ def _convert_tree_to_elements(self, package_tree: dict[str, dict], parent_path: ) ) + for class_def in node["classes"].values(): + elements.append(self._create_class_element(class_def)) + return elements def _create_class_element(self, class_def: ClassDefinition) -> TreemapElement: - class_size = class_def.size - return TreemapElement( name=class_def.get_name(), - size=class_size, + size=class_def.size, type=TreemapType.DEX, path=class_def.fqn(), is_dir=False, diff --git a/tests/unit/size/treemap/test_android_known_libraries.py b/tests/unit/size/treemap/test_android_known_libraries.py new file mode 100644 index 00000000..e2c0dba1 --- /dev/null +++ b/tests/unit/size/treemap/test_android_known_libraries.py @@ -0,0 +1,43 
@@ +"""Unit tests for the Android known-libraries catalog.""" + +from launchpad.size.treemap.android_known_libraries import resolve_known_library + + +def test_resolves_prefix_to_library() -> None: + assert resolve_known_library("androidx.core.app.NotificationCompat") == ("AndroidX", "androidx") + assert resolve_known_library("kotlin.collections.CollectionsKt") == ("Kotlin", "kotlin") + assert resolve_known_library("io.sentry.Sentry") == ("Sentry", "io.sentry") + + +def test_most_specific_prefix_wins() -> None: + # com.google.firebase is more specific than any shorter com.google.* prefix. + assert resolve_known_library("com.google.firebase.analytics.FirebaseAnalytics") == ( + "Firebase", + "com.google.firebase", + ) + assert resolve_known_library("com.google.android.gms.tasks.Task") == ( + "Google Play Services", + "com.google.android.gms", + ) + + +def test_matches_on_package_boundaries() -> None: + # Exact prefix match (class living directly in the prefix package). + assert resolve_known_library("androidx") == ("AndroidX", "androidx") + # A package that merely starts with the prefix string must not match. + assert resolve_known_library("androidxtra.Foo") is None + assert resolve_known_library("kotlinpoet.Foo") is None + + +def test_emergetools_snapshots_is_grouped_but_app_is_not() -> None: + assert resolve_known_library("com.emergetools.snapshots.SnapshotTest") == ( + "EmergeTools", + "com.emergetools.snapshots", + ) + # A first-party Emerge Tools app must not be grouped as a library. 
+ assert resolve_known_library("com.emergetools.hackernews.MainActivity") is None + + +def test_unknown_and_empty_names_return_none() -> None: + assert resolve_known_library("com.example.myapp.MainActivity") is None + assert resolve_known_library("") is None diff --git a/tests/unit/size/treemap/test_dex_element_builder.py b/tests/unit/size/treemap/test_dex_element_builder.py new file mode 100644 index 00000000..6f65e348 --- /dev/null +++ b/tests/unit/size/treemap/test_dex_element_builder.py @@ -0,0 +1,133 @@ +"""Unit tests for DEX treemap grouping by known libraries.""" + +from __future__ import annotations + +from launchpad.parsers.android.dex.android_code_utils import AndroidCodeUtils +from launchpad.parsers.android.dex.types import ClassDefinition +from launchpad.size.models.common import FileInfo +from launchpad.size.models.treemap import TreemapElement, TreemapType +from launchpad.size.treemap.dex_element_builder import DexElementBuilder + + +def _class_def(fqn: str, size: int) -> ClassDefinition: + return ClassDefinition( + size=size, + signature=AndroidCodeUtils.fqn_to_class_signature(fqn), + source_file_name=None, + interfaces=[], + annotations=[], + access_flags=[], + fields=[], + methods=[], + ) + + +def _file_info() -> FileInfo: + return FileInfo( + path="Dex", + hash="", + full_path=None, + size=0, + file_type="dex", + treemap_type=TreemapType.DEX, + is_dir=False, + ) + + +def _build(class_defs: list[ClassDefinition]) -> TreemapElement: + builder = DexElementBuilder(filesystem_block_size=4096, class_definitions=class_defs) + element = builder.build_element(_file_info(), "Dex") + assert element is not None + return element + + +def _find_child(element: TreemapElement, name: str) -> TreemapElement | None: + for child in element.children: + if child.name == name: + return child + return None + + +def test_known_library_grouped_first_party_stays_flat() -> None: + element = _build( + [ + _class_def("androidx.core.app.NotificationCompat", 500), + 
_class_def("com.example.myapp.MainActivity", 300), + ] + ) + + # First-party code stays in the normal package tree. + assert _find_child(element, "Libraries") is not None + com = _find_child(element, "com") + assert com is not None + assert com.size == 300 + + # The recognized library is moved under the "Libraries" node. + assert _find_child(element, "androidx") is None + libraries = _find_child(element, "Libraries") + assert libraries is not None + assert libraries.type == TreemapType.DEX + androidx = _find_child(libraries, "AndroidX") + assert androidx is not None + assert androidx.size == 500 + + +def test_matched_prefix_is_stripped_inside_library_node() -> None: + element = _build([_class_def("androidx.core.app.NotificationCompat", 500)]) + + libraries = _find_child(element, "Libraries") + assert libraries is not None + androidx = _find_child(libraries, "AndroidX") + assert androidx is not None + + # The "androidx" prefix is stripped: the hierarchy resumes at "core". + core = _find_child(androidx, "core") + assert core is not None + assert _find_child(androidx, "androidx") is None + # Class leaves keep their full FQN as the path. 
+ app = _find_child(core, "app") + assert app is not None + leaf = app.children[0] + assert leaf.name == "NotificationCompat" + assert leaf.path == "androidx.core.app.NotificationCompat" + + +def test_multiple_prefixes_collapse_into_one_library() -> None: + element = _build( + [ + _class_def("kotlin.collections.CollectionsKt", 400), + _class_def("kotlinx.coroutines.CoroutineScope", 600), + ] + ) + + libraries = _find_child(element, "Libraries") + assert libraries is not None + kotlin = _find_child(libraries, "Kotlin") + assert kotlin is not None + assert kotlin.size == 1000 + assert {child.name for child in kotlin.children} == {"collections", "coroutines"} + + +def test_no_libraries_node_when_nothing_recognized() -> None: + element = _build([_class_def("com.example.myapp.MainActivity", 300)]) + + assert _find_child(element, "Libraries") is None + assert _find_child(element, "com") is not None + + +def test_total_size_preserved_and_libraries_sorted_by_size() -> None: + element = _build( + [ + _class_def("androidx.core.app.NotificationCompat", 500), + _class_def("io.sentry.Sentry", 900), + _class_def("com.example.myapp.MainActivity", 300), + ] + ) + + assert element.size == 1700 + + libraries = _find_child(element, "Libraries") + assert libraries is not None + assert libraries.size == 1400 + # Libraries are sorted by descending size (Sentry 900 before AndroidX 500). + assert [child.name for child in libraries.children] == ["Sentry", "AndroidX"]