Signal-Android/reproducible-builds/apkdiff/apkdiff.py
2026-06-10 15:20:00 -04:00

327 lines
11 KiB
Python
Executable File

#! /usr/bin/env python3
import difflib
import subprocess
import sys
import re
import logging
from xml.etree.ElementTree import Element
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import Optional
from androguard.core import axml
from loguru import logger
# Raise the threshold of the stdlib "deepdiff" logger so only ERROR and above get through.
# NOTE(review): deepdiff is not imported anywhere in the visible code — confirm this is still needed.
logging.getLogger("deepdiff").setLevel(logging.ERROR)
# Silence loguru messages coming from the "androguard" namespace (used below to decode binary XML).
logger.disable("androguard")
@dataclass
class XmlDifference:
    """One mismatch found while comparing two XML elements."""

    # Kind of mismatch: "tag", "attribute", "text" or "child_count".
    diff_type: str
    # Slash-separated tag path the mismatch is reported against.
    path: str
    # Name of the differing attribute (set for "attribute" differences).
    attribute_name: Optional[str] = None
    # Value seen in the first document, if any.
    first_value: Optional[str] = None
    # Value seen in the second document, if any.
    second_value: Optional[str] = None
    # Tag of the children whose count differs (set for "child_count" differences).
    child_tag: Optional[str] = None
# Zip entries that are skipped by both the entry-name comparison and the entry-content comparison.
IGNORE_FILES = [
    # Related to app signing. Not expected to be present in unsigned builds. Doesn't affect app code.
    "META-INF/MANIFEST.MF",
    "META-INF/CERTIFIC.SF",
    "META-INF/CERTIFIC.RSA",
    "META-INF/TEXTSECU.SF",
    "META-INF/TEXTSECU.RSA",
    "META-INF/BNDLTOOL.SF",
    "META-INF/BNDLTOOL.RSA",
    "META-INF/code_transparency_signed.jwt",
    "stamp-cert-sha256",
]

# NOTE(review): not referenced anywhere in the visible code — confirm whether this is still needed.
ALLOWED_ARSC_DIFF_PATHS = [".res1"]
def compare(apk1, apk2) -> bool:
    """
    Compares two APKs and reports whether they match.

    Three checks are run: zip entry names, zip entry contents, and (when present) the resource table. Every check is
    executed even if an earlier one failed, so that all mismatches get printed in a single run.

    :param apk1: Path to the first APK.
    :param apk2: Path to the second APK.
    :return: True if every check passed, False otherwise.
    """
    print(f"Comparing: \n\t{apk1}\n\t{apk2}\n")
    print("Unzipping...")

    # Open both archives as context managers so their file handles are released even if a check raises.
    with ZipFile(apk1, "r") as zip1, ZipFile(apk2, "r") as zip2:
        entry_names = compare_entry_names(zip1, zip2)
        entry_contents = compare_entry_contents(zip1, zip2)

        has_arsc_1 = "resources.arsc" in zip1.namelist()
        has_arsc_2 = "resources.arsc" in zip2.namelist()

    # Some splits (e.g. ABI config splits) contain no resource table. Compare when both APKs have one, treat both
    # missing as a match, and fail if only one of them has it.
    if has_arsc_1 and has_arsc_2:
        # The resource dump works on the APK paths, so the zip handles are no longer needed here.
        resources = compare_resources_arsc(apk1, apk2)
    elif has_arsc_1 != has_arsc_2:
        print("resources.arsc is present in only one of the APKs!")
        resources = False
    else:
        resources = True

    return entry_names and entry_contents and resources
def compare_entry_names(zip1: ZipFile, zip2: ZipFile) -> bool:
    """
    Checks that both archives contain the same entry names, leaving out everything listed in IGNORE_FILES.

    Prints the entries that exist in only one archive and returns True only when the two name lists are identical.
    """
    print("Comparing zip entry names...")

    names_1 = sorted(name for name in zip1.namelist() if name not in IGNORE_FILES)
    names_2 = sorted(name for name in zip2.namelist() if name not in IGNORE_FILES)

    matches = True
    if len(names_1) != len(names_2):
        print(f"Manifest lengths differ! {len(names_1)} vs {len(names_2)}")
        matches = False

    unique_1 = set(names_1)
    unique_2 = set(names_2)
    one_sided = (
        (zip1, sorted(unique_1 - unique_2)),
        (zip2, sorted(unique_2 - unique_1)),
    )
    for archive, extra_names in one_sided:
        if not extra_names:
            continue
        print(f"Files present only in {archive.filename}:")
        for name in extra_names:
            print(f" - {name}")
        matches = False

    # Same set and same length can still hide a mismatch (e.g. duplicated names), so walk the sorted lists pairwise.
    if matches:
        for name_1, name_2 in zip(names_1, names_2):
            if name_1 != name_2:
                print(f"Sorted manifests don't match: {name_1} vs {name_2}")
                matches = False

    return matches
def compare_entry_contents(zip1: ZipFile, zip2: ZipFile) -> bool:
    """
    Compares the bytes of every entry that exists under the same name in both archives.

    Entries listed in IGNORE_FILES are skipped. When two entries differ and handle_special_cases() does not accept
    the difference, both versions are extracted under the mismatches/ directory and the comparison fails.

    :param zip1: The first opened APK.
    :param zip2: The second opened APK.
    :return: True if the entry counts match and no unaccepted difference was found, False otherwise.
    """
    print("Comparing zip entry contents...")

    info_list_1 = [info for info in zip1.infolist() if info.filename not in IGNORE_FILES]
    info_list_2 = [info for info in zip2.infolist() if info.filename not in IGNORE_FILES]

    success = True
    if len(info_list_1) != len(info_list_2):
        print(f"APK info lists of different length! {len(info_list_1)} vs {len(info_list_2)}")
        success = False

    # Index the second archive by filename so each lookup is O(1) instead of a scan over the whole list.
    # The index is built in reverse so that pop() yields same-named entries in their original archive order,
    # i.e. each entry of the first archive is paired with the first not-yet-used entry of the second.
    unmatched_2: dict = {}
    for entry_info_2 in reversed(info_list_2):
        unmatched_2.setdefault(entry_info_2.filename, []).append(entry_info_2)

    for entry_info_1 in info_list_1:
        candidates = unmatched_2.get(entry_info_1.filename)
        if not candidates:
            # No counterpart in the second archive; nothing to compare for this entry.
            continue
        entry_info_2 = candidates.pop()

        entry_bytes_1 = zip1.read(entry_info_1.filename)
        entry_bytes_2 = zip2.read(entry_info_2.filename)

        if entry_bytes_1 != entry_bytes_2 and not handle_special_cases(entry_info_1.filename, entry_bytes_1, entry_bytes_2):
            zip1.extract(entry_info_1, "mismatches/first")
            zip2.extract(entry_info_2, "mismatches/second")
            print(f"APKs differ on file {entry_info_1.filename}! Files extracted to the mismatches/ directory.")
            success = False

    return success
def handle_special_cases(filename: str, bytes1: bytes, bytes2: bytes) -> bool:
    """
    There are some specific files that we expect will not be byte-for-byte identical. We want to ensure that the files
    are matching except these expected differences. The differences are all related to extra XML attributes that the
    Play Store may add as part of the bundle process. These differences do not affect the behavior of the app and are
    unfortunately unavoidable given the modern realities of the Play Store.

    :param filename: Name of the zip entry whose contents differ.
    :param bytes1: Entry contents from the first APK.
    :param bytes2: Entry contents from the second APK.
    :return: True if the difference is acceptable for this file, False otherwise.
    """
    if filename == "AndroidManifest.xml":
        print("Comparing AndroidManifest.xml...")
        return compare_android_xml(bytes1, bytes2)

    if filename == "resources.arsc":
        # we will compare resources.arsc separately with aapt2, so we can ignore any differences here
        return True

    # fullmatch, not match: the whole entry name has to be a splits XML, so that names which merely start with
    # the pattern (e.g. "res/xml/splits0.xml.bak") are not given the lenient comparison.
    if re.fullmatch(r"res/xml/splits[0-9]+\.xml", filename):
        print(f"Comparing {filename}...")
        return compare_split_xml(bytes1, bytes2)

    return False
def compare_android_xml(bytes1: bytes, bytes2: bytes) -> bool:
    """
    Compares two binary AndroidManifest.xml blobs while tolerating a fixed set of differences.

    Tolerated differences are: attributes whose name contains "split" on the manifest or application element, any
    attribute on manifest/application/meta-data, and a differing number of meta-data children. Anything else is
    printed and makes the comparison fail.
    """
    unexpected = []
    for diff in compare_xml(bytes1, bytes2):
        if diff.diff_type == "attribute":
            if diff.path == "manifest/application/meta-data":
                continue
            attr_name = diff.attribute_name
            on_manifest_or_app = diff.path in ("manifest", "manifest/application")
            if on_manifest_or_app and attr_name is not None and "split" in attr_name.lower():
                continue
        elif diff.diff_type == "child_count" and diff.child_tag == "meta-data":
            continue
        unexpected.append(diff)

    if not unexpected:
        return True

    print(unexpected)
    return False
def compare_split_xml(bytes1: bytes, bytes2: bytes) -> bool:
    """
    Compares two binary splits XML blobs, tolerating only attribute differences on splits/module/language/entry.

    Any other difference is printed and makes the comparison fail.
    """
    unexpected = [
        diff
        for diff in compare_xml(bytes1, bytes2)
        if diff.diff_type != "attribute" or diff.path != "splits/module/language/entry"
    ]

    if not unexpected:
        return True

    print(unexpected)
    return False
def compare_resources_arsc(apk1: str, apk2: str) -> bool:
    """
    Compares the resource tables of two APKs using their dump_resources() output.

    Prints a unified diff of the two dumps when they are not equal.
    """
    print("Comparing resources.arsc...")

    dump_1 = dump_resources(apk1)
    dump_2 = dump_resources(apk2)
    if dump_1 == dump_2:
        return True

    print("resources.arsc files differ!")
    for line in difflib.unified_diff(dump_1, dump_2, fromfile=apk1, tofile=apk2, lineterm=""):
        print(line)
    return False
def dump_resources(apk) -> list[str]:
    """
    Runs `aapt2 dump resources` on an APK and returns its output as a list of lines.

    :param apk: Path to the APK, passed to aapt2 as-is.
    :return: The stripped stdout of aapt2, split into lines.
    :raises RuntimeError: If the aapt2 executable cannot be found, or if it exits with a non-zero status.
    """
    # Keep the try body limited to the launch itself: that is the only place FileNotFoundError is expected.
    try:
        result = subprocess.run(
            ["aapt2", "dump", "resources", apk],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError as err:
        # Chain the original error so the traceback still shows what the OS reported.
        raise RuntimeError("aapt2 is not installed or not in the PATH.") from err

    if result.returncode != 0:
        raise RuntimeError(f"aapt2 failed with error: {result.stderr.strip()}")

    return result.stdout.strip().splitlines()
def compare_xml(bytes1: bytes, bytes2: bytes) -> list[XmlDifference]:
    """
    Renders two binary XML blobs to text with androguard's AXMLPrinter and returns their differences.

    Returns an empty list when the rendered texts are identical; otherwise both texts are parsed and compared
    element by element.
    """
    xml_text_1 = axml.AXMLPrinter(bytes1).get_xml().decode("utf-8")
    xml_text_2 = axml.AXMLPrinter(bytes2).get_xml().decode("utf-8")

    # Identical text means there is nothing to report, so skip parsing altogether.
    if xml_text_1 == xml_text_2:
        return []

    return compare_xml_elements(ET.fromstring(xml_text_1), ET.fromstring(xml_text_2))
def compare_xml_elements(elem1: Element, elem2: Element, path: str = "") -> list[XmlDifference]:
    """Walk two XML elements in parallel and collect every XmlDifference found between them."""
    # A tag mismatch is reported against the parent path and ends the comparison of this subtree.
    if elem1.tag != elem2.tag:
        return [XmlDifference(diff_type="tag", path=path, first_value=elem1.tag, second_value=elem2.tag)]

    here = f"{path}/{elem1.tag}" if path else elem1.tag
    found: list[XmlDifference] = []

    # Attributes: look at the union of names so attributes missing on either side are reported too.
    for name in sorted(elem1.attrib.keys() | elem2.attrib.keys()):
        left = elem1.attrib.get(name)
        right = elem2.attrib.get(name)
        if left != right:
            found.append(XmlDifference(diff_type="attribute", path=here, attribute_name=name, first_value=left, second_value=right))

    # Text: surrounding whitespace is not significant.
    left_text = (elem1.text or "").strip()
    right_text = (elem2.text or "").strip()
    if left_text != right_text:
        found.append(XmlDifference(diff_type="text", path=here, first_value=left_text, second_value=right_text))

    # Children: bucket each side by tag, keeping document order inside every bucket.
    grouped_1: dict[str, list[Element]] = {}
    grouped_2: dict[str, list[Element]] = {}
    for parent, grouped in ((elem1, grouped_1), (elem2, grouped_2)):
        for child in parent:
            grouped.setdefault(child.tag, []).append(child)

    for tag in sorted(grouped_1.keys() | grouped_2.keys()):
        same_tag_1 = grouped_1.get(tag, [])
        same_tag_2 = grouped_2.get(tag, [])

        if len(same_tag_1) != len(same_tag_2):
            found.append(XmlDifference(diff_type="child_count", path=here, child_tag=tag, first_value=str(len(same_tag_1)), second_value=str(len(same_tag_2))))

        # Pair same-tag children by position; surplus children on either side are covered by the count difference.
        for child_1, child_2 in zip(same_tag_1, same_tag_2):
            found += compare_xml_elements(child_1, child_2, here)

    return found
if __name__ == "__main__":
    # Command-line entry point: expects exactly two APK paths and exits 0 on a match, 1 otherwise.
    if len(sys.argv) != 3:
        print("Usage: apkdiff <pathToFirstApk> <pathToSecondApk>")
        sys.exit(1)

    first_apk, second_apk = sys.argv[1], sys.argv[2]
    apks_match = compare(first_apk, second_apk)

    print("APKs match!" if apks_match else "APKs don't match!")
    sys.exit(0 if apks_match else 1)