From 98fd924a7f3d1dae5368e8a80ac629807ae54bc4 Mon Sep 17 00:00:00 2001
From: Sagar Dagdu <shags032@gmail.com>
Date: Fri, 8 May 2026 07:19:55 +0530
Subject: [PATCH] fix: prefer structured typedstream prefix decoding

Fix typedstream attributedBody recovery for 32-126 byte messages whose
length byte is printable ASCII, and cover the regression with tests
across the parser edge cases.

Co-authored-by: Sagar Dagdu <shags032@gmail.com>
---
 CHANGELOG.md                             |  4 +++
 Sources/IMsgCore/TypedStreamParser.swift | 33 ++++++++++++--------
 Tests/IMsgCoreTests/UtilityTests.swift   | 39 ++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 13 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd0baf3..1b38824 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@
 - docs: document Linux as read-only support for existing copied Messages
   databases.
 
+### Message Decoding
+- fix: strip printable typedstream length bytes from recovered `attributedBody`
+  text for 32-126 byte messages (#107, thanks @SagarSDagdu).
+
 ## 0.7.3 - 2026-05-06
 
 ### Private API Bridge
diff --git a/Sources/IMsgCore/TypedStreamParser.swift b/Sources/IMsgCore/TypedStreamParser.swift
index 7b939ec..d65df33 100644
--- a/Sources/IMsgCore/TypedStreamParser.swift
+++ b/Sources/IMsgCore/TypedStreamParser.swift
@@ -38,36 +38,43 @@ enum TypedStreamParser {
   }
 
   /// Strips a typedstream length prefix from `segment` and returns the longest valid UTF-8 decoding.
-  /// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`. The older
-  /// implementation only handled the single-byte form, which silently dropped any message longer
-  /// than 127 bytes because the unstripped 0x81/0x82 byte is invalid as a UTF-8 leading byte.
+  /// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`.
+  /// Structured prefixes always win over the raw `prefixLen = 0` decode: otherwise, when the
+  /// length byte is itself a printable-ASCII character (body length 32–126), the unstripped decode
+  /// produces an N+1 character string that beats the correct N-character body.
   private static func decodeSegment(_ segment: [UInt8]) -> String {
     guard let first = segment.first else { return "" }
 
-    var prefixLengths: Set<Int> = [0]
+    var structuredPrefixes: [Int] = []
     if first < 0x80, Int(first) == segment.count - 1 {
-      prefixLengths.insert(1)
+      structuredPrefixes.append(1)
     }
     if first == 0x81, segment.count >= 2 {
-      prefixLengths.insert(2)
+      structuredPrefixes.append(2)
     }
     if first == 0x82, segment.count >= 3 {
-      prefixLengths.insert(3)
+      structuredPrefixes.append(3)
     }
 
-    var best = ""
-    for prefixLen in prefixLengths {
-      guard prefixLen <= segment.count else { continue }
+    var bestStructured = ""
+    var anyStructuredValid = false
+    for prefixLen in structuredPrefixes {
       let body = Array(segment[prefixLen...])
       guard
         let candidate = String(bytes: body, encoding: .utf8)?
           .trimmingLeadingControlCharacters()
       else { continue }
-      if candidate.count > best.count {
-        best = candidate
+      anyStructuredValid = true
+      if candidate.count > bestStructured.count {
+        bestStructured = candidate
       }
     }
-    return best
+    if anyStructuredValid {
+      return bestStructured
+    }
+
+    return String(bytes: segment, encoding: .utf8)?
+      .trimmingLeadingControlCharacters() ?? ""
   }
 
   private static func findSequence(_ needle: [UInt8], in haystack: [UInt8], from start: Int)
diff --git a/Tests/IMsgCoreTests/UtilityTests.swift b/Tests/IMsgCoreTests/UtilityTests.swift
index 5515e25..595ab50 100644
--- a/Tests/IMsgCoreTests/UtilityTests.swift
+++ b/Tests/IMsgCoreTests/UtilityTests.swift
@@ -237,6 +237,45 @@ func typedStreamParserDecodesLongMessageWith0x82Prefix() {
   #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
 }
 
+@Test
+func typedStreamParserDoesNotPrependPrintableAsciiLengthByte() {
+  // 64-byte body of 'A' → length byte 0x40 ('@'), printable.
+  // Without the structured-prefix-wins rule, the raw decode keeps the '@' and beats the stripped body by one character.
+  let text = String(repeating: "A", count: 64)
+  let bytes: [UInt8] =
+    [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
+  #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
+}
+
+@Test
+func typedStreamParserDecodes32ByteBodyAtLowerRegressionEdge() {
+  // 32-byte body → length byte 0x20 (space). Lower edge of the 32–126 printable-ASCII window.
+  let text = String(repeating: "A", count: 32)
+  let bytes: [UInt8] =
+    [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
+  #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
+}
+
+@Test
+func typedStreamParserDecodes126ByteBodyAtUpperRegressionEdge() {
+  // 126-byte body → length byte 0x7E ('~'). Upper edge of the window — 0x7F is DEL/control and
+  // would be trimmed (not prepended), so 0x7E is the precise top of the failure range.
+  let text = String(repeating: "A", count: 126)
+  let bytes: [UInt8] =
+    [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
+  #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
+}
+
+@Test
+func typedStreamParserDecodesMultibyteUTF8BodyInRegressionWindow() {
+  // 12 × 🎉 = 48 UTF-8 bytes → length byte 0x30 ('0'), printable. Confirms the structured-prefix
+  // preference works for non-ASCII bodies too — the bug is byte-count driven, not ASCII-specific.
+  let text = String(repeating: "🎉", count: 12)
+  let bytes: [UInt8] =
+    [0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
+  #expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
+}
+
 @Test
 func typedStreamParserHandlesMixedBinaryNoise() {
   // First byte 0x42 is neither 0x81 nor 0x82, and does not equal segment.count - 1 (= 6).