// // Copyright 2018 Signal Messenger, LLC // SPDX-License-Identifier: AGPL-3.0-only // import GRDB public enum FullTextSearchIndexer { public static let matchTag = "match" // MARK: - Normalization private static var charactersToRemove: CharacterSet = { // * We want to strip punctuation - and our definition of "punctuation" // is broader than `CharacterSet.punctuationCharacters`. // * FTS should be robust to (i.e. ignore) illegal and control characters, // but it's safer if we filter them ourselves as well. var charactersToFilter = CharacterSet.punctuationCharacters charactersToFilter.formUnion(CharacterSet.illegalCharacters) charactersToFilter.formUnion(CharacterSet.controlCharacters) // We want to strip all ASCII characters except: // * Letters a-z, A-Z // * Numerals 0-9 // * Whitespace var asciiToFilter = CharacterSet(charactersIn: UnicodeScalar(0x0)!.. String { // 1. Filter out invalid characters. let filtered = text.removeCharacters(characterSet: charactersToRemove) // 2. Simplify whitespace. let simplified = filtered.replaceCharacters( characterSet: .whitespacesAndNewlines, replacement: " ", ) // 3. Strip leading & trailing whitespace last, since we may replace // filtered characters with whitespace. let trimmed = simplified.trimmingCharacters(in: .whitespacesAndNewlines) // 4. Use canonical mapping. // // From the GRDB docs: // // Generally speaking, matches may fail when content and query don’t use // the same unicode normalization. SQLite actually exhibits inconsistent // behavior in this regard. // // For example, for aimé to match aimé, they better have the same // normalization: the NFC aim\u{00E9} form may not match its NFD aime\u{0301} // equivalent. Most strings that you get from Swift, UIKit and Cocoa use NFC, // so be careful with NFD inputs (such as strings from the HFS+ file system, // or strings that you can’t trust like network inputs). Use // String.precomposedStringWithCanonicalMapping to turn a string into NFC. // // Besides, if you want fi to match the ligature fi (U+FB01), then you need // to normalize your indexed contents and inputs to NFKC or NFKD. Use // String.precomposedStringWithCompatibilityMapping to turn a string into NFKC. let canonical = trimmed.precomposedStringWithCanonicalMapping return canonical } // MARK: - Querying // We want to match by prefix for "search as you type" functionality. // SQLite does not support suffix or contains matches. public static func buildQuery(for searchText: String) -> String { // 1. Normalize the search text. // // TODO: We could arguably convert to lowercase since the search // is case-insensitive. let normalizedSearchText = normalizeText(searchText) // 2. Split the non-numeric text into query terms (or tokens). let nonNumericText = String(String.UnicodeScalarView(normalizedSearchText.unicodeScalars.lazy.map { if CharacterSet.decimalDigits.contains($0) { return " " } else { return $0 } })) var queryTerms = nonNumericText.split(separator: " ") // 3. Add an additional numeric-only query term. let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter { CharacterSet.decimalDigits.contains($0) } let digitsOnly: Substring = Substring(String(String.UnicodeScalarView(digitsOnlyScalars))) queryTerms.append(digitsOnly) // 4. De-duplicate and sort query terms. // Duplicate terms are redundant. // Sorting terms makes the output of this method deterministic and easier to test, // and the order won't affect the search results. queryTerms = Array(Set(queryTerms)).sorted() // 5. Filter the query terms. let filteredQueryTerms = queryTerms.filter { // Ignore empty terms. $0.count > 0 }.map { // Allow partial match of each term. // // Note that we use double-quotes to enclose each search term. // Quoted search terms can include a few more characters than // "bareword" (non-quoted) search terms. This shouldn't matter, // since we're filtering all of the affected characters, but // quoting protects us from any bugs in that logic. "\"\($0)\"*" } // 6. Join terms into query string. let query = filteredQueryTerms.joined(separator: " ") return query } } // MARK: - Message Index // See: http://groue.github.io/GRDB.swift/docs/4.1/index.html#full-text-search // See: https://www.sqlite.org/fts5.html extension FullTextSearchIndexer { public static let ftsTableName = "indexable_text_fts" static let contentTableName = "indexable_text" static let uniqueIdColumn = "uniqueId" static let collectionColumn = "collection" static let ftsContentColumn = "ftsIndexableContent" private static let legacyCollectionName = "TSInteraction" private static func indexableContent(for message: TSMessage, tx: DBReadTransaction) -> String? { guard !message.isViewOnceMessage else { // Don't index "view-once messages". return nil } guard !message.isGroupStoryReply else { return nil } guard message.editState != .pastRevision else { return nil } guard let bodyText = message.rawBody(transaction: tx) else { return nil } return normalizeText(bodyText) } public static func insert( _ message: TSMessage, tx: DBWriteTransaction, ) { guard let ftsContent = indexableContent(for: message, tx: tx) else { return } executeUpdate( sql: """ INSERT INTO \(contentTableName) (\(collectionColumn), \(uniqueIdColumn), \(ftsContentColumn)) VALUES (?, ?, ?) """, arguments: [legacyCollectionName, message.uniqueId, ftsContent], tx: tx, ) } public static func update( _ message: TSMessage, tx: DBWriteTransaction, ) { delete(message, tx: tx) insert(message, tx: tx) } public static func delete( _ message: TSMessage, tx: DBWriteTransaction, ) { executeUpdate( sql: """ DELETE FROM \(contentTableName) WHERE \(uniqueIdColumn) == ? AND \(collectionColumn) == ? """, arguments: [message.uniqueId, legacyCollectionName], tx: tx, ) } private static func executeUpdate( sql: String, arguments: StatementArguments, tx: DBWriteTransaction, ) { do { // Worth using a cached statement here, as we may be indexing many // things at once. try tx.database.executeWithCachedStatementThrows( sql: sql, arguments: arguments, ) } catch { // We intentionally don't use failIfThrows here because we know the // FTS index relatively frequently reports corruption errors; for // these specifically swallow them rather than flagging the entire // database as corrupted. // // TODO: Implement "rebuild the FTS index" in response to it becoming corrupted. owsFailDebug("Failed to perform FTS index operation! \(error.grdbErrorForLogging)") } } // MARK: - Querying public static func search( for searchText: String, maxResults: Int, tx: DBReadTransaction, block: (_ message: TSMessage, _ snippet: String, _ stop: inout Bool) -> Void, ) { let query = buildQuery(for: searchText) if query.isEmpty { // FullTextSearchFinder.query filters some characters, so query // may now be empty. Logger.warn("Empty query.") return } // Search with the query interface or SQL do { // GRDB TODO: We could use bm25() instead of rank to order results. let indexOfContentColumnInFTSTable = 0 // Determines the length of the snippet. let numTokens: UInt = 15 let matchSnippet = "match_snippet" let sql: String = """ SELECT \(contentTableName).\(collectionColumn), \(contentTableName).\(uniqueIdColumn), SNIPPET(\(ftsTableName), \(indexOfContentColumnInFTSTable), '<\(matchTag)>', '', '…', \(numTokens)) AS \(matchSnippet) FROM \(ftsTableName) LEFT JOIN \(contentTableName) ON \(contentTableName).rowId = \(ftsTableName).rowId WHERE \(ftsTableName).\(ftsContentColumn) MATCH ? ORDER BY rank LIMIT \(maxResults) """ let cursor = try Row.fetchCursor(tx.database, sql: sql, arguments: [query]) while let row = try cursor.next() { let collection: String = row[collectionColumn] guard collection == legacyCollectionName else { owsFailDebug("Found something other than a message in the FTS table") continue } guard let uniqueId = (row[uniqueIdColumn] as String).nilIfEmpty else { owsFailDebug("Found a message with a uniqueId in the FTS table") continue } let snippet: String = row[matchSnippet] guard let message = TSMessage.fetchMessageViaCache(uniqueId: uniqueId, transaction: tx) else { owsFailDebug("Couldn't find message that exists in the FTS table") continue } var stop = false block(message, snippet, &stop) if stop { break } } } catch { owsFailDebug("Couldn't fetch results: \(error.grdbErrorForLogging)") } } }