From 2f866beb00634e4f5cc712429b176a563feb76df Mon Sep 17 00:00:00 2001 From: george-signal <98181642+george-signal@users.noreply.github.com> Date: Wed, 20 Jul 2022 11:19:38 -0700 Subject: [PATCH] Change how excessive diacriticals are removed. Fixes a bug where some emoji were mistaken for zalgo, causing all diacritics in a message to get stripped. Instead, replace "risky" characters with too many diacritics with the unicode replacement character, leaving others alone. --- SignalCoreKit/src/NSString+OWS.m | 62 +++++++------------ SignalCoreKit/src/StringSanitizer.swift | 58 +++++++++++++++++ .../src/StringSanitizerTests.swift | 60 ++++++++++++++++++ 3 files changed, 139 insertions(+), 41 deletions(-) create mode 100644 SignalCoreKit/src/StringSanitizer.swift create mode 100644 SignalCoreKitTests/src/StringSanitizerTests.swift diff --git a/SignalCoreKit/src/NSString+OWS.m b/SignalCoreKit/src/NSString+OWS.m index 3487e8f..f2a4129 100644 --- a/SignalCoreKit/src/NSString+OWS.m +++ b/SignalCoreKit/src/NSString+OWS.m @@ -40,7 +40,8 @@ NS_ASSUME_NONNULL_BEGIN #pragma mark - -static void *kNSString_SSK_hasExcessiveDiacriticals = &kNSString_SSK_hasExcessiveDiacriticals; +static void *kNSString_SSK_needsSanitization = &kNSString_SSK_needsSanitization; +static void *kNSString_SSK_sanitizedCounterpart = &kNSString_SSK_sanitizedCounterpart; static unichar bidiLeftToRightIsolate = 0x2066; static unichar bidiRightToLeftIsolate = 0x2067; static unichar bidiFirstStrongIsolate = 0x2068; @@ -230,7 +231,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069; - (NSString *)filterSubstringForDisplay { // We don't want to strip a substring before filtering. - return self.filterForIndicScripts.filterForExcessiveDiacriticals.ensureBalancedBidiControlCharacters; + return self.filterForIndicScripts.sanitized.ensureBalancedBidiControlCharacters; } - (NSString *)filterStringForDisplay @@ -240,7 +241,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069; - (NSString *)filterFilename { - return self.ows_stripped.filterForIndicScripts.filterForExcessiveDiacriticals.filterUnsafeFilenameCharacters; + return self.ows_stripped.filterForIndicScripts.sanitized.filterUnsafeFilenameCharacters; } - (NSString *)withoutBidiControlCharacters @@ -331,47 +332,26 @@ static unichar bidiPopDirectionalIsolate = 0x2069; return [NSString stringWithFormat:@"%C%@%C", bidiFirstStrongIsolate, self.ensureBalancedBidiControlCharacters, bidiPopDirectionalIsolate]; } -- (NSString *)filterForExcessiveDiacriticals +- (NSString *)sanitized { - if (!self.hasExcessiveDiacriticals) { + NSNumber *cachedNeedsSanitization = objc_getAssociatedObject(self, kNSString_SSK_needsSanitization); + if (cachedNeedsSanitization != nil) { + if (cachedNeedsSanitization.boolValue) { + return objc_getAssociatedObject(self, kNSString_SSK_sanitizedCounterpart) ?: self; + } else { + return self; + } + } + + StringSanitizer *sanitizer = [[StringSanitizer alloc] initWithString:self]; + const BOOL needsSanitization = sanitizer.needsSanitization; + objc_setAssociatedObject(self, kNSString_SSK_needsSanitization, @(needsSanitization), OBJC_ASSOCIATION_COPY); + if (!needsSanitization) { return self; } - return [self stringByFoldingWithOptions:NSDiacriticInsensitiveSearch locale:[NSLocale currentLocale]]; -} - -- (BOOL)hasExcessiveDiacriticals -{ - NSNumber *cachedValue = objc_getAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals); - if (!cachedValue) { - cachedValue = @([self computeHasExcessiveDiacriticals]); - objc_setAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals, cachedValue, OBJC_ASSOCIATION_COPY); - } - - return cachedValue.boolValue; -} - -- (BOOL)computeHasExcessiveDiacriticals -{ - // discard any zalgo style text, by detecting maximum number of glyphs per character - NSUInteger index = 0; - - // store in local var, it's a hot code path. - NSUInteger length = self.length; - while (index < length) { - // Walk the grapheme clusters in the string. - NSRange range = [self rangeOfComposedCharacterSequenceAtIndex:index]; - if (range.length > 8) { - // There are too many characters in this grapheme cluster. - return YES; - } else if (range.location != index || range.length < 1) { - // This should never happen. - OWSFailDebug( - @"unexpected composed character sequence: %lu, %@", (unsigned long)index, NSStringFromRange(range)); - return YES; - } - index = range.location + range.length; - } - return NO; + NSString *sanitized = sanitizer.sanitized; + objc_setAssociatedObject(self, kNSString_SSK_sanitizedCounterpart, sanitized, OBJC_ASSOCIATION_COPY); + return sanitized; } + (NSRegularExpression *)anyASCIIRegex diff --git a/SignalCoreKit/src/StringSanitizer.swift b/SignalCoreKit/src/StringSanitizer.swift new file mode 100644 index 0000000..3917e34 --- /dev/null +++ b/SignalCoreKit/src/StringSanitizer.swift @@ -0,0 +1,58 @@ +// +// Copyright (c) 2022 Open Whisper Systems. All rights reserved. +// + +import Foundation + +/// Replaces extended grapheme clusters having too many combining marks with the unicode replacement character. +/// +/// Example usage: +/// ``` +/// let sanitizer = StringSanitizer("Jack said, “H̴̬̪̤̗̪̳̑̓e̵̱̗͇̰̽̊͛̿̒̚͠r̶̨̯̻̹̪̫̣̪̹͇̗̀͌̃̍̄͗̎͊͌ę̶̣͍̗̘̺̪̱̇̈́̈́͗͌̀̊̏ͅ'̷̧̧̭̜̱̜͉̟͇̣̉̃ͅs̸̪̻̯͔̤̣̱̾̽̌̇̃̒͋͂̈́̀͌̍̚ ̶͙́̓͊̈́̉̂͗̆͗̑͂̕J̵̨̧̧̠̩͈̹͈̦̩̣͙͐̿̇̈́̓ͅͅo̵̡̥̪͘h̵̡̧̢̘̟͓͖̤̼̟̺͓̰͈͓̎͋̎͝ņ̶̛͖̻̻̝͗̃͋͠n̶̮͈̯̩̘̠̻͔̈̌̐͘̚͝y̵̧̡̛͙͈̹̹̹̗̤̙͖̜̰̰͌͆̏̑͐̽̍͜!̸̡͈͔͆”) +/// if (sanitizer.needsSanitization) { +/// print(sanitizer.sanitized); // Jack said, “��������������” +/// } +/// ``` +@objc class StringSanitizer: NSObject { + private static let maxCodePoints = 16 + private let string: String + + @objc(initWithString:) + init(_ string: String) { + self.string = string + } + + /// Indicates if the string needs to be modified. This is slightly cheaper than calling `sanitized`. + @objc lazy var needsSanitization: Bool = { + return string.contains { + $0.unicodeScalars.count > Self.maxCodePoints + } + }() + + /// Returns a modified version of the string if sanitization is needed, or the original string otherwise. + @objc lazy var sanitized: String = { + if !needsSanitization { + return string + } + precondition(!string.isEmpty) + return sanitize(string) + }() + + private func isBad(_ c: Character) -> Bool { + return c.unicodeScalars.count > Self.maxCodePoints + } + + private func sanitize(_ original: String) -> String { + var remaining = original[...] + var result = "" + // An overestimate, because we will shorten at least one Character. + result.reserveCapacity(original.utf8.count) + while let nextBadCharIndex = remaining.firstIndex(where: isBad) { + result.append(contentsOf: remaining[..