Change how excessive diacriticals are removed.

Fixes a bug where some emoji were mistaken for
zalgo, causing all diacritics in a message to be
stripped. Instead, replace only the "risky" characters
that carry too many diacritics with the Unicode
replacement character, leaving the others alone.
This commit is contained in:
george-signal 2022-07-20 11:19:38 -07:00 committed by GitHub
parent 603f75807e
commit 2f866beb00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 139 additions and 41 deletions

View File

@ -40,7 +40,8 @@ NS_ASSUME_NONNULL_BEGIN
#pragma mark -
static void *kNSString_SSK_hasExcessiveDiacriticals = &kNSString_SSK_hasExcessiveDiacriticals;
static void *kNSString_SSK_needsSanitization = &kNSString_SSK_needsSanitization;
static void *kNSString_SSK_sanitizedCounterpart = &kNSString_SSK_sanitizedCounterpart;
static unichar bidiLeftToRightIsolate = 0x2066;
static unichar bidiRightToLeftIsolate = 0x2067;
static unichar bidiFirstStrongIsolate = 0x2068;
@ -230,7 +231,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
- (NSString *)filterSubstringForDisplay
{
// We don't want to strip a substring before filtering.
return self.filterForIndicScripts.filterForExcessiveDiacriticals.ensureBalancedBidiControlCharacters;
return self.filterForIndicScripts.sanitized.ensureBalancedBidiControlCharacters;
}
- (NSString *)filterStringForDisplay
@ -240,7 +241,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
- (NSString *)filterFilename
{
return self.ows_stripped.filterForIndicScripts.filterForExcessiveDiacriticals.filterUnsafeFilenameCharacters;
return self.ows_stripped.filterForIndicScripts.sanitized.filterUnsafeFilenameCharacters;
}
- (NSString *)withoutBidiControlCharacters
@ -331,47 +332,26 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
return [NSString stringWithFormat:@"%C%@%C", bidiFirstStrongIsolate, self.ensureBalancedBidiControlCharacters, bidiPopDirectionalIsolate];
}
- (NSString *)filterForExcessiveDiacriticals
- (NSString *)sanitized
{
if (!self.hasExcessiveDiacriticals) {
NSNumber *cachedNeedsSanitization = objc_getAssociatedObject(self, kNSString_SSK_needsSanitization);
if (cachedNeedsSanitization != nil) {
if (cachedNeedsSanitization.boolValue) {
return objc_getAssociatedObject(self, kNSString_SSK_sanitizedCounterpart) ?: self;
} else {
return self;
}
}
StringSanitizer *sanitizer = [[StringSanitizer alloc] initWithString:self];
const BOOL needsSanitization = sanitizer.needsSanitization;
objc_setAssociatedObject(self, kNSString_SSK_needsSanitization, @(needsSanitization), OBJC_ASSOCIATION_COPY);
if (!needsSanitization) {
return self;
}
return [self stringByFoldingWithOptions:NSDiacriticInsensitiveSearch locale:[NSLocale currentLocale]];
}
- (BOOL)hasExcessiveDiacriticals
{
NSNumber *cachedValue = objc_getAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals);
if (!cachedValue) {
cachedValue = @([self computeHasExcessiveDiacriticals]);
objc_setAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals, cachedValue, OBJC_ASSOCIATION_COPY);
}
return cachedValue.boolValue;
}
- (BOOL)computeHasExcessiveDiacriticals
{
// discard any zalgo style text, by detecting maximum number of glyphs per character
NSUInteger index = 0;
// store in local var, it's a hot code path.
NSUInteger length = self.length;
while (index < length) {
// Walk the grapheme clusters in the string.
NSRange range = [self rangeOfComposedCharacterSequenceAtIndex:index];
if (range.length > 8) {
// There are too many characters in this grapheme cluster.
return YES;
} else if (range.location != index || range.length < 1) {
// This should never happen.
OWSFailDebug(
@"unexpected composed character sequence: %lu, %@", (unsigned long)index, NSStringFromRange(range));
return YES;
}
index = range.location + range.length;
}
return NO;
NSString *sanitized = sanitizer.sanitized;
objc_setAssociatedObject(self, kNSString_SSK_sanitizedCounterpart, sanitized, OBJC_ASSOCIATION_COPY);
return sanitized;
}
+ (NSRegularExpression *)anyASCIIRegex

View File

@ -0,0 +1,58 @@
//
// Copyright (c) 2022 Open Whisper Systems. All rights reserved.
//
import Foundation
/// Replaces extended grapheme clusters having too many combining marks with the Unicode
/// replacement character (U+FFFD), leaving every other character untouched.
///
/// A cluster is treated as excessive when it is built from more than `maxCodePoints`
/// Unicode scalars.
///
/// Example usage:
/// ```
/// let sanitizer = StringSanitizer("Jack said, H̴̬̪̤̗̪̳̑̓e̵̱̗͇̰̽̊͛̿̒̚͠r̶̨̯̻̹̪̫̣̪̹͇̗̀͌̃̍̄͗̎͊͌ę̶̣͍̗̘̺̪̱̇̈́̈́͗͌̀̊̏ͅ'̷̧̧̭̜̱̜͉̟͇̣̉̃ͅs̸̪̻̯͔̤̣̱̾̽̌̇̃̒͋͂̈́̀͌̍̚ ̶͙́̓͊̈́̉̂͗̆͗̑͂̕J̵̨̧̧̠̩͈̹͈̦̩̣͙͐̿̇̈́̓ͅͅo̵̡̥̪͘h̵̡̧̢̘̟͓͖̤̼̟̺͓̰͈͓̎͋̎͝ņ̶̛͖̻̻̝͗̃͋͠n̶̮͈̯̩̘̠̻͔̈̌̐͘̚͝y̵̧̡̛͙͈̹̹̹̗̤̙͖̜̰̰͌͆̏̑͐̽̍͜!̸̡͈͔͆")
/// if sanitizer.needsSanitization {
///     // Every cluster above the limit is printed as U+FFFD; the rest is unchanged.
///     print(sanitizer.sanitized)
/// }
/// ```
@objc class StringSanitizer: NSObject {
    /// Largest number of Unicode scalars a single `Character` may hold before it is replaced.
    private static let maxCodePoints = 16

    /// Stand-in written to the output for each excessive grapheme cluster.
    private static let replacementCharacter: Character = "\u{FFFD}"

    /// The unmodified input.
    private let string: String

    /// Creates a sanitizer for `string`. No scanning happens until a property is read.
    @objc(initWithString:)
    init(_ string: String) {
        self.string = string
    }

    /// Indicates if the string needs to be modified. This is slightly cheaper than calling `sanitized`.
    @objc lazy var needsSanitization: Bool = {
        return string.contains(where: Self.isBad)
    }()

    /// Returns a modified version of the string if sanitization is needed, or the original string otherwise.
    @objc lazy var sanitized: String = {
        guard needsSanitization else {
            return string
        }
        return sanitize(string)
    }()

    /// Single source of truth for "this cluster has too many scalars"; used by both the
    /// cheap check and the rewrite so they can never disagree.
    private static func isBad(_ c: Character) -> Bool {
        return c.unicodeScalars.count > maxCodePoints
    }

    /// Builds a copy of `original` in which every excessive cluster is swapped for U+FFFD.
    private func sanitize(_ original: String) -> String {
        var remaining = original[...]
        var result = ""
        // An overestimate whenever something is replaced, because the replacement is
        // shorter than any cluster that exceeds the limit.
        result.reserveCapacity(original.utf8.count)
        while let nextBadCharIndex = remaining.firstIndex(where: Self.isBad) {
            // Copy the clean run verbatim, then substitute the offending cluster.
            result.append(contentsOf: remaining[..<nextBadCharIndex])
            result.append(Self.replacementCharacter)
            remaining = remaining[nextBadCharIndex...].dropFirst()
        }
        result.append(contentsOf: remaining)
        return result
    }
}

View File

@ -0,0 +1,60 @@
//
// Copyright (c) 2022 Open Whisper Systems. All rights reserved.
//
import XCTest
@testable import SignalCoreKit
/// Exercises `StringSanitizer` on clean input, on ordinary combining marks and emoji,
/// and on zalgo text whose clusters must be replaced with U+FFFD.
///
/// Expected replacement characters are written as `\u{FFFD}` escapes so the assertions
/// do not depend on how the source file is encoded or rendered.
class StringSanitizerTests: XCTestCase {
    /// An empty string is returned untouched.
    func testEmpty() {
        let string = ""
        let sanitizer = StringSanitizer(string)
        XCTAssertFalse(sanitizer.needsSanitization)
        XCTAssertEqual(sanitizer.sanitized, string)
    }

    /// Plain ASCII never needs sanitization.
    func testASCII() {
        let string = "abc"
        let sanitizer = StringSanitizer(string)
        XCTAssertFalse(sanitizer.needsSanitization)
        XCTAssertEqual(sanitizer.sanitized, string)
    }

    /// A character with a small number of combining marks is preserved.
    func testCombiningMarks() {
        let string = "abx̧c"
        let sanitizer = StringSanitizer(string)
        XCTAssertFalse(sanitizer.needsSanitization)
        XCTAssertEqual(sanitizer.sanitized, string)
    }

    /// Emoji must not be mistaken for zalgo.
    func testEmoji() {
        let string = "a👩🏿💋👩🏻b"
        let sanitizer = StringSanitizer(string)
        XCTAssertFalse(sanitizer.needsSanitization)
        XCTAssertEqual(sanitizer.sanitized, string)
    }

    /// Only the zalgo clusters are replaced; neighbouring text and emoji survive.
    func testZalgo() {
        let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝abx̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝👩🏿💋👩🏻cx̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
        let sanitizer = StringSanitizer(string)
        XCTAssertTrue(sanitizer.needsSanitization)
        let expected = "\u{FFFD}ab\u{FFFD}\u{FFFD}👩🏿💋👩🏻c\u{FFFD}"
        XCTAssertEqual(sanitizer.sanitized, expected)
    }

    /// A string made of one zalgo cluster collapses to one replacement character.
    func testSingleZalgo() {
        let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
        let sanitizer = StringSanitizer(string)
        XCTAssertTrue(sanitizer.needsSanitization)
        let expected = "\u{FFFD}"
        XCTAssertEqual(sanitizer.sanitized, expected)
    }

    /// Adjacent zalgo clusters are each replaced individually.
    func testTwoZalgo() {
        let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
        let sanitizer = StringSanitizer(string)
        XCTAssertTrue(sanitizer.needsSanitization)
        let expected = "\u{FFFD}\u{FFFD}"
        XCTAssertEqual(sanitizer.sanitized, expected)
    }
}