fix: prefer structured typedstream prefix decoding
Fix typedstream attributedBody recovery for 32-126 byte messages whose length byte is printable ASCII, and keep the regression covered across the parser edge cases.

Co-authored-by: Sagar Dagdu <shags032@gmail.com>
This commit is contained in:
parent
0d1ca83815
commit
98fd924a7f
@ -9,6 +9,10 @@
|
||||
- docs: document Linux as read-only support for existing copied Messages
|
||||
databases.
|
||||
|
||||
### Message Decoding
|
||||
- fix: strip printable typedstream length bytes from recovered `attributedBody`
|
||||
text for 32-126 byte messages (#107, thanks @SagarSDagdu).
|
||||
|
||||
## 0.7.3 - 2026-05-06
|
||||
|
||||
### Private API Bridge
|
||||
|
||||
@ -38,36 +38,43 @@ enum TypedStreamParser {
|
||||
}
|
||||
|
||||
/// Strips a typedstream length prefix from `segment` and returns the longest valid UTF-8 decoding.
|
||||
/// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`. The older
|
||||
/// implementation only handled the single-byte form, which silently dropped any message longer
|
||||
/// than 127 bytes because the unstripped 0x81/0x82 byte is invalid as a UTF-8 leading byte.
|
||||
/// Length prefix forms (BER-style): single byte (< 0x80), `0x81 NN`, or `0x82 NN NN`.
|
||||
/// Structured prefixes always win over the raw `prefixLen = 0` decode: otherwise, when the
|
||||
/// length byte is itself a printable-ASCII character (body length 32–126), the unstripped decode
|
||||
/// produces an N+1 character string that beats the correct N-character body.
|
||||
private static func decodeSegment(_ segment: [UInt8]) -> String {
|
||||
guard let first = segment.first else { return "" }
|
||||
|
||||
var prefixLengths: Set<Int> = [0]
|
||||
var structuredPrefixes: [Int] = []
|
||||
if first < 0x80, Int(first) == segment.count - 1 {
|
||||
prefixLengths.insert(1)
|
||||
structuredPrefixes.append(1)
|
||||
}
|
||||
if first == 0x81, segment.count >= 2 {
|
||||
prefixLengths.insert(2)
|
||||
structuredPrefixes.append(2)
|
||||
}
|
||||
if first == 0x82, segment.count >= 3 {
|
||||
prefixLengths.insert(3)
|
||||
structuredPrefixes.append(3)
|
||||
}
|
||||
|
||||
var best = ""
|
||||
for prefixLen in prefixLengths {
|
||||
guard prefixLen <= segment.count else { continue }
|
||||
var bestStructured = ""
|
||||
var anyStructuredValid = false
|
||||
for prefixLen in structuredPrefixes {
|
||||
let body = Array(segment[prefixLen...])
|
||||
guard
|
||||
let candidate = String(bytes: body, encoding: .utf8)?
|
||||
.trimmingLeadingControlCharacters()
|
||||
else { continue }
|
||||
if candidate.count > best.count {
|
||||
best = candidate
|
||||
anyStructuredValid = true
|
||||
if candidate.count > bestStructured.count {
|
||||
bestStructured = candidate
|
||||
}
|
||||
}
|
||||
return best
|
||||
if anyStructuredValid {
|
||||
return bestStructured
|
||||
}
|
||||
|
||||
return String(bytes: segment, encoding: .utf8)?
|
||||
.trimmingLeadingControlCharacters() ?? ""
|
||||
}
|
||||
|
||||
private static func findSequence(_ needle: [UInt8], in haystack: [UInt8], from start: Int)
|
||||
|
||||
@ -237,6 +237,45 @@ func typedStreamParserDecodesLongMessageWith0x82Prefix() {
|
||||
#expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
|
||||
}
|
||||
|
||||
@Test
|
||||
func typedStreamParserDoesNotPrependPrintableAsciiLengthByte() {
|
||||
// 64-byte body of 'A' → length byte 0x40 ('@'), printable.
|
||||
// Without the structured-prefix-wins rule, the raw decode keeps the '@' and beats the stripped body by one character.
|
||||
let text = String(repeating: "A", count: 64)
|
||||
let bytes: [UInt8] =
|
||||
[0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
|
||||
#expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
|
||||
}
|
||||
|
||||
@Test
|
||||
func typedStreamParserDecodes32ByteBodyAtLowerRegressionEdge() {
|
||||
// 32-byte body → length byte 0x20 (space). Lower edge of the 32–126 printable-ASCII window.
|
||||
let text = String(repeating: "A", count: 32)
|
||||
let bytes: [UInt8] =
|
||||
[0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
|
||||
#expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
|
||||
}
|
||||
|
||||
@Test
|
||||
func typedStreamParserDecodes126ByteBodyAtUpperRegressionEdge() {
|
||||
// 126-byte body → length byte 0x7E ('~'). Upper edge of the window — 0x7F is DEL/control and
|
||||
// would be trimmed (not prepended), so 0x7E is the precise top of the failure range.
|
||||
let text = String(repeating: "A", count: 126)
|
||||
let bytes: [UInt8] =
|
||||
[0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
|
||||
#expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
|
||||
}
|
||||
|
||||
@Test
|
||||
func typedStreamParserDecodesMultibyteUTF8BodyInRegressionWindow() {
|
||||
// 12 × 🎉 = 48 UTF-8 bytes → length byte 0x30 ('0'), printable. Confirms the structured-prefix
|
||||
// preference works for non-ASCII bodies too — the bug is byte-count driven, not ASCII-specific.
|
||||
let text = String(repeating: "🎉", count: 12)
|
||||
let bytes: [UInt8] =
|
||||
[0x01, 0x2b, UInt8(text.utf8.count)] + Array(text.utf8) + [0x86, 0x84]
|
||||
#expect(TypedStreamParser.parseAttributedBody(Data(bytes)) == text)
|
||||
}
|
||||
|
||||
@Test
|
||||
func typedStreamParserHandlesMixedBinaryNoise() {
|
||||
// First byte 0x42 is neither 0x81 nor 0x82, and does not equal segment.count - 1 (= 6).
|
||||
|
||||
Loading…
Reference in New Issue
Block a user