From 98fd924a7f3d1dae5368e8a80ac629807ae54bc4 Mon Sep 17 00:00:00 2001 From: Sagar Dagdu Date: Fri, 8 May 2026 07:19:55 +0530 Subject: [PATCH] fix: prefer structured typedstream prefix decoding Fix typedstream attributedBody recovery for 32-126 byte messages whose length byte is printable ASCII, and keep the regression covered across the parser edge cases. Co-authored-by: Sagar Dagdu --- CHANGELOG.md | 4 +++ Sources/IMsgCore/TypedStreamParser.swift | 33 ++++++++++++-------- Tests/IMsgCoreTests/UtilityTests.swift | 39 ++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd0baf3..1b38824 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ - docs: document Linux as read-only support for existing copied Messages databases. +### Message Decoding +- fix: strip printable typedstream length bytes from recovered `attributedBody` + text for 32-126 byte messages (#107, thanks @SagarSDagdu). + ## 0.7.3 - 2026-05-06 ### Private API Bridge diff --git a/Sources/IMsgCore/TypedStreamParser.swift b/Sources/IMsgCore/TypedStreamParser.swift index 7b939ec..d65df33 100644 --- a/Sources/IMsgCore/TypedStreamParser.swift +++ b/Sources/IMsgCore/TypedStreamParser.swift @@ -38,36 +38,43 @@ enum TypedStreamParser { } /// Strips a typedstream length prefix from `segment` and returns the longest valid UTF-8 decoding. - /// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`. The older - /// implementation only handled the single-byte form, which silently dropped any message longer - /// than 127 bytes because the unstripped 0x81/0x82 byte is invalid as a UTF-8 leading byte. + /// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`. 
+ /// Structured prefixes always win over the raw `prefixLen = 0` decode: otherwise, when the + /// length byte is itself a printable-ASCII character (body length 32–126), the unstripped decode + /// produces an N+1 character string that beats the correct N-character body. private static func decodeSegment(_ segment: [UInt8]) -> String { guard let first = segment.first else { return "" } - var prefixLengths: Set = [0] + var structuredPrefixes: [Int] = [] if first < 0x80, Int(first) == segment.count - 1 { - prefixLengths.insert(1) + structuredPrefixes.append(1) } if first == 0x81, segment.count >= 2 { - prefixLengths.insert(2) + structuredPrefixes.append(2) } if first == 0x82, segment.count >= 3 { - prefixLengths.insert(3) + structuredPrefixes.append(3) } - var best = "" - for prefixLen in prefixLengths { - guard prefixLen <= segment.count else { continue } + var bestStructured = "" + var anyStructuredValid = false + for prefixLen in structuredPrefixes { let body = Array(segment[prefixLen...]) guard let candidate = String(bytes: body, encoding: .utf8)? .trimmingLeadingControlCharacters() else { continue } - if candidate.count > best.count { - best = candidate + anyStructuredValid = true + if candidate.count > bestStructured.count { + bestStructured = candidate } } - return best + if anyStructuredValid { + return bestStructured + } + + return String(bytes: segment, encoding: .utf8)? + .trimmingLeadingControlCharacters() ?? 
"" } private static func findSequence(_ needle: [UInt8], in haystack: [UInt8], from start: Int) diff --git a/Tests/IMsgCoreTests/UtilityTests.swift b/Tests/IMsgCoreTests/UtilityTests.swift index 5515e25..595ab50 100644 --- a/Tests/IMsgCoreTests/UtilityTests.swift +++ b/Tests/IMsgCoreTests/UtilityTests.swift @@ -237,6 +237,45 @@ func typedStreamParserDecodesLongMessageWith0x82Prefix() { #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text) } +@Test +func typedStreamParserDoesNotPrependPrintableAsciiLengthByte() { + // 64-byte body of 'A' → length byte 0x40 ('@'), printable. + // Without the structured-prefix-wins rule, the raw decode keeps the '@' and beats the stripped body by one character. + let text = String(repeating: "A", count: 64) + let bytes: [UInt8] = + [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84] + #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text) +} + +@Test +func typedStreamParserDecodes32ByteBodyAtLowerRegressionEdge() { + // 32-byte body → length byte 0x20 (space). Lower edge of the 32–126 printable-ASCII window. + let text = String(repeating: "A", count: 32) + let bytes: [UInt8] = + [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84] + #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text) +} + +@Test +func typedStreamParserDecodes126ByteBodyAtUpperRegressionEdge() { + // 126-byte body → length byte 0x7E ('~'). Upper edge of the window — 0x7F is DEL/control and + // would be trimmed (not prepended), so 0x7E is the precise top of the failure range. + let text = String(repeating: "A", count: 126) + let bytes: [UInt8] = + [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84] + #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text) +} + +@Test +func typedStreamParserDecodesMultibyteUTF8BodyInRegressionWindow() { + // 12 × 🎉 = 48 UTF-8 bytes → length byte 0x30 ('0'), printable. 
Confirms the structured-prefix + // preference works for non-ASCII bodies too — the bug is byte-count driven, not ASCII-specific. + let text = String(repeating: "🎉", count: 12) + let bytes: [UInt8] = + [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84] + #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text) +} + @Test func typedStreamParserHandlesMixedBinaryNoise() { // First byte 0x42 is neither 0x81 nor 0x82, and does not equal segment.count - 1 (= 6).