Change how excessive diacriticals are removed.
Fixes a bug where some emoji were mistaken for zalgo, causing all diacritics in a message to get stripped. Instead, replace "risky" characters with too many diacritics with the unicode replacement character, leaving others alone.
This commit is contained in:
parent
603f75807e
commit
2f866beb00
@ -40,7 +40,8 @@ NS_ASSUME_NONNULL_BEGIN
|
||||
|
||||
#pragma mark -
|
||||
|
||||
static void *kNSString_SSK_hasExcessiveDiacriticals = &kNSString_SSK_hasExcessiveDiacriticals;
|
||||
static void *kNSString_SSK_needsSanitization = &kNSString_SSK_needsSanitization;
|
||||
static void *kNSString_SSK_sanitizedCounterpart = &kNSString_SSK_sanitizedCounterpart;
|
||||
static unichar bidiLeftToRightIsolate = 0x2066;
|
||||
static unichar bidiRightToLeftIsolate = 0x2067;
|
||||
static unichar bidiFirstStrongIsolate = 0x2068;
|
||||
@ -230,7 +231,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
|
||||
- (NSString *)filterSubstringForDisplay
|
||||
{
|
||||
// We don't want to strip a substring before filtering.
|
||||
return self.filterForIndicScripts.filterForExcessiveDiacriticals.ensureBalancedBidiControlCharacters;
|
||||
return self.filterForIndicScripts.sanitized.ensureBalancedBidiControlCharacters;
|
||||
}
|
||||
|
||||
- (NSString *)filterStringForDisplay
|
||||
@ -240,7 +241,7 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
|
||||
|
||||
- (NSString *)filterFilename
|
||||
{
|
||||
return self.ows_stripped.filterForIndicScripts.filterForExcessiveDiacriticals.filterUnsafeFilenameCharacters;
|
||||
return self.ows_stripped.filterForIndicScripts.sanitized.filterUnsafeFilenameCharacters;
|
||||
}
|
||||
|
||||
- (NSString *)withoutBidiControlCharacters
|
||||
@ -331,47 +332,26 @@ static unichar bidiPopDirectionalIsolate = 0x2069;
|
||||
return [NSString stringWithFormat:@"%C%@%C", bidiFirstStrongIsolate, self.ensureBalancedBidiControlCharacters, bidiPopDirectionalIsolate];
|
||||
}
|
||||
|
||||
- (NSString *)filterForExcessiveDiacriticals
|
||||
- (NSString *)sanitized
|
||||
{
|
||||
if (!self.hasExcessiveDiacriticals) {
|
||||
NSNumber *cachedNeedsSanitization = objc_getAssociatedObject(self, kNSString_SSK_needsSanitization);
|
||||
if (cachedNeedsSanitization != nil) {
|
||||
if (cachedNeedsSanitization.boolValue) {
|
||||
return objc_getAssociatedObject(self, kNSString_SSK_sanitizedCounterpart) ?: self;
|
||||
} else {
|
||||
return self;
|
||||
}
|
||||
}
|
||||
|
||||
StringSanitizer *sanitizer = [[StringSanitizer alloc] initWithString:self];
|
||||
const BOOL needsSanitization = sanitizer.needsSanitization;
|
||||
objc_setAssociatedObject(self, kNSString_SSK_needsSanitization, @(needsSanitization), OBJC_ASSOCIATION_COPY);
|
||||
if (!needsSanitization) {
|
||||
return self;
|
||||
}
|
||||
return [self stringByFoldingWithOptions:NSDiacriticInsensitiveSearch locale:[NSLocale currentLocale]];
|
||||
}
|
||||
|
||||
- (BOOL)hasExcessiveDiacriticals
|
||||
{
|
||||
NSNumber *cachedValue = objc_getAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals);
|
||||
if (!cachedValue) {
|
||||
cachedValue = @([self computeHasExcessiveDiacriticals]);
|
||||
objc_setAssociatedObject(self, kNSString_SSK_hasExcessiveDiacriticals, cachedValue, OBJC_ASSOCIATION_COPY);
|
||||
}
|
||||
|
||||
return cachedValue.boolValue;
|
||||
}
|
||||
|
||||
- (BOOL)computeHasExcessiveDiacriticals
|
||||
{
|
||||
// discard any zalgo style text, by detecting maximum number of glyphs per character
|
||||
NSUInteger index = 0;
|
||||
|
||||
// store in local var, it's a hot code path.
|
||||
NSUInteger length = self.length;
|
||||
while (index < length) {
|
||||
// Walk the grapheme clusters in the string.
|
||||
NSRange range = [self rangeOfComposedCharacterSequenceAtIndex:index];
|
||||
if (range.length > 8) {
|
||||
// There are too many characters in this grapheme cluster.
|
||||
return YES;
|
||||
} else if (range.location != index || range.length < 1) {
|
||||
// This should never happen.
|
||||
OWSFailDebug(
|
||||
@"unexpected composed character sequence: %lu, %@", (unsigned long)index, NSStringFromRange(range));
|
||||
return YES;
|
||||
}
|
||||
index = range.location + range.length;
|
||||
}
|
||||
return NO;
|
||||
NSString *sanitized = sanitizer.sanitized;
|
||||
objc_setAssociatedObject(self, kNSString_SSK_sanitizedCounterpart, sanitized, OBJC_ASSOCIATION_COPY);
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
+ (NSRegularExpression *)anyASCIIRegex
|
||||
|
||||
58
SignalCoreKit/src/StringSanitizer.swift
Normal file
58
SignalCoreKit/src/StringSanitizer.swift
Normal file
@ -0,0 +1,58 @@
|
||||
//
|
||||
// Copyright (c) 2022 Open Whisper Systems. All rights reserved.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
/// Replaces extended grapheme clusters having too many combining marks with the unicode replacement character.
|
||||
///
|
||||
/// Example usage:
|
||||
/// ```
|
||||
/// let sanitizer = StringSanitizer("Jack said, “H̴̬̪̤̗̪̳̑̓e̵̱̗͇̰̽̊͛̿̒̚͠r̶̨̯̻̹̪̫̣̪̹͇̗̀͌̃̍̄͗̎͊͌ę̶̣͍̗̘̺̪̱̇̈́̈́͗͌̀̊̏ͅ'̷̧̧̭̜̱̜͉̟͇̣̉̃ͅs̸̪̻̯͔̤̣̱̾̽̌̇̃̒͋͂̈́̀͌̍̚ ̶͙́̓͊̈́̉̂͗̆͗̑͂̕J̵̨̧̧̠̩͈̹͈̦̩̣͙͐̿̇̈́̓ͅͅo̵̡̥̪͘h̵̡̧̢̘̟͓͖̤̼̟̺͓̰͈͓̎͋̎͝ņ̶̛͖̻̻̝͗̃͋͠n̶̮͈̯̩̘̠̻͔̈̌̐͘̚͝y̵̧̡̛͙͈̹̹̹̗̤̙͖̜̰̰͌͆̏̑͐̽̍͜!̸̡͈͔͆”)
|
||||
/// if (sanitizer.needsSanitization) {
|
||||
/// print(sanitizer.sanitized); // Jack said, “<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>”
|
||||
/// }
|
||||
/// ```
|
||||
@objc class StringSanitizer: NSObject {
|
||||
private static let maxCodePoints = 16
|
||||
private let string: String
|
||||
|
||||
@objc(initWithString:)
|
||||
init(_ string: String) {
|
||||
self.string = string
|
||||
}
|
||||
|
||||
/// Indicates if the string needs to be modified. This is slightly cheaper than calling `sanitized`.
|
||||
@objc lazy var needsSanitization: Bool = {
|
||||
return string.contains {
|
||||
$0.unicodeScalars.count > Self.maxCodePoints
|
||||
}
|
||||
}()
|
||||
|
||||
/// Returns a modified version of the string if sanitization is needed, or the original string otherwise.
|
||||
@objc lazy var sanitized: String = {
|
||||
if !needsSanitization {
|
||||
return string
|
||||
}
|
||||
precondition(!string.isEmpty)
|
||||
return sanitize(string)
|
||||
}()
|
||||
|
||||
private func isBad(_ c: Character) -> Bool {
|
||||
return c.unicodeScalars.count > Self.maxCodePoints
|
||||
}
|
||||
|
||||
private func sanitize(_ original: String) -> String {
|
||||
var remaining = original[...]
|
||||
var result = ""
|
||||
// An overestimate, because we will shorten at least one Character.
|
||||
result.reserveCapacity(original.utf8.count)
|
||||
while let nextBadCharIndex = remaining.firstIndex(where: isBad) {
|
||||
result.append(contentsOf: remaining[..<nextBadCharIndex])
|
||||
result.append("\u{FFFD}")
|
||||
remaining = remaining[nextBadCharIndex...].dropFirst()
|
||||
}
|
||||
result.append(contentsOf: remaining)
|
||||
return result
|
||||
}
|
||||
}
|
||||
60
SignalCoreKitTests/src/StringSanitizerTests.swift
Normal file
60
SignalCoreKitTests/src/StringSanitizerTests.swift
Normal file
@ -0,0 +1,60 @@
|
||||
//
|
||||
// Copyright (c) 2022 Open Whisper Systems. All rights reserved.
|
||||
//
|
||||
|
||||
import XCTest
|
||||
@testable import SignalCoreKit
|
||||
|
||||
class StringSanitizerTests: XCTestCase {
|
||||
func testEmpty() {
|
||||
let string = ""
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertFalse(sanitizer.needsSanitization)
|
||||
XCTAssertEqual(sanitizer.sanitized, string)
|
||||
}
|
||||
|
||||
func testASCII() {
|
||||
let string = "abc"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertFalse(sanitizer.needsSanitization)
|
||||
XCTAssertEqual(sanitizer.sanitized, string)
|
||||
}
|
||||
|
||||
func testCombiningMarks() {
|
||||
let string = "abx̧c"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertFalse(sanitizer.needsSanitization)
|
||||
XCTAssertEqual(sanitizer.sanitized, string)
|
||||
}
|
||||
|
||||
func testEmoji() {
|
||||
let string = "a👩🏿❤️💋👩🏻b"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertFalse(sanitizer.needsSanitization)
|
||||
XCTAssertEqual(sanitizer.sanitized, string)
|
||||
}
|
||||
|
||||
func testZalgo() {
|
||||
let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝abx̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝👩🏿❤️💋👩🏻cx̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertTrue(sanitizer.needsSanitization)
|
||||
let expected = "<EFBFBD>ab<EFBFBD><EFBFBD>👩🏿❤️💋👩🏻c<EFBFBD>"
|
||||
XCTAssertEqual(sanitizer.sanitized, expected)
|
||||
}
|
||||
|
||||
func testSingleZalgo() {
|
||||
let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertTrue(sanitizer.needsSanitization)
|
||||
let expected = "<EFBFBD>"
|
||||
XCTAssertEqual(sanitizer.sanitized, expected)
|
||||
}
|
||||
|
||||
func testTwoZalgo() {
|
||||
let string = "x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝x̸̢̧̛̙̝͈͈̖̳̗̰̆̈́̆̿̈́̅̽͆̈́̿̔͌̚͝"
|
||||
let sanitizer = StringSanitizer(string)
|
||||
XCTAssertTrue(sanitizer.needsSanitization)
|
||||
let expected = "<EFBFBD><EFBFBD>"
|
||||
XCTAssertEqual(sanitizer.sanitized, expected)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user