From 44f5b77cc7f1e284befc5d48b57737f53ee0be2d Mon Sep 17 00:00:00 2001 From: Dragory <2606411+Dragory@users.noreply.github.com> Date: Sun, 17 Oct 2021 08:03:38 +0300 Subject: [PATCH] perf(automod): also merge regexes in match_links, match_words --- .../plugins/Automod/triggers/matchLinks.ts | 42 +++++++++++++++--- .../plugins/Automod/triggers/matchRegex.ts | 20 +-------- .../plugins/Automod/triggers/matchWords.ts | 43 +++++++++++-------- backend/src/utils/mergeRegexes.ts | 17 ++++++++ backend/src/utils/mergeWordsIntoRegex.ts | 6 +++ 5 files changed, 85 insertions(+), 43 deletions(-) create mode 100644 backend/src/utils/mergeRegexes.ts create mode 100644 backend/src/utils/mergeWordsIntoRegex.ts diff --git a/backend/src/plugins/Automod/triggers/matchLinks.ts b/backend/src/plugins/Automod/triggers/matchLinks.ts index 2c70387c..31aa8006 100644 --- a/backend/src/plugins/Automod/triggers/matchLinks.ts +++ b/backend/src/plugins/Automod/triggers/matchLinks.ts @@ -7,12 +7,18 @@ import { TRegex } from "../../../validatorUtils"; import { getTextMatchPartialSummary } from "../functions/getTextMatchPartialSummary"; import { MatchableTextType, matchMultipleTextTypesOnMessage } from "../functions/matchMultipleTextTypesOnMessage"; import { automodTrigger } from "../helpers"; +import { mergeRegexes } from "../../../utils/mergeRegexes"; +import { mergeWordsIntoRegex } from "../../../utils/mergeWordsIntoRegex"; interface MatchResultType { type: MatchableTextType; link: string; } +const regexCache = new WeakMap(); + +const quickLinkCheck = /^https?:\/\//i; + export const MatchLinksTrigger = automodTrigger()({ configType: t.type({ include_domains: tNullable(t.array(t.string)), @@ -52,7 +58,7 @@ export const MatchLinksTrigger = automodTrigger()({ for (const link of links) { // "real link" = a link that Discord highlights - if (trigger.only_real_links && !link.input.match(/^https?:\/\//i)) { + if (trigger.only_real_links && !quickLinkCheck.test(link.input)) { continue; } @@ -62,7 +68,13 @@ export const MatchLinksTrigger = automodTrigger()({ // In order of specificity, regex > word > domain if (trigger.exclude_regex) { - for (const sourceRegex of trigger.exclude_regex) { + if (!regexCache.has(trigger.exclude_regex)) { + const toCache = mergeRegexes(trigger.exclude_regex, "i"); + regexCache.set(trigger.exclude_regex, toCache); + } + const regexes = regexCache.get(trigger.exclude_regex)!; + + for (const sourceRegex of regexes) { const matches = await pluginData.state.regexRunner.exec(sourceRegex, link.input).catch(allowTimeout); if (matches) { continue typeLoop; @@ -71,7 +83,13 @@ export const MatchLinksTrigger = automodTrigger()({ } if (trigger.include_regex) { - for (const sourceRegex of trigger.include_regex) { + if (!regexCache.has(trigger.include_regex)) { + const toCache = mergeRegexes(trigger.include_regex, "i"); + regexCache.set(trigger.include_regex, toCache); + } + const regexes = regexCache.get(trigger.include_regex)!; + + for (const sourceRegex of regexes) { const matches = await pluginData.state.regexRunner.exec(sourceRegex, link.input).catch(allowTimeout); if (matches) { return { extra: { type, link: link.input } }; @@ -80,8 +98,13 @@ export const MatchLinksTrigger = automodTrigger()({ } if (trigger.exclude_words) { - for (const word of trigger.exclude_words) { - const regex = new RegExp(escapeStringRegexp(word), "i"); + if (!regexCache.has(trigger.exclude_words)) { + const toCache = mergeWordsIntoRegex(trigger.exclude_words, "i"); + regexCache.set(trigger.exclude_words, [toCache]); + } + const regexes = regexCache.get(trigger.exclude_words)!; + + for (const regex of regexes) { if (regex.test(link.input)) { continue typeLoop; } @@ -89,8 +112,13 @@ export const MatchLinksTrigger = automodTrigger()({ } if (trigger.include_words) { - for (const word of trigger.include_words) { - const regex = new RegExp(escapeStringRegexp(word), "i"); + if (!regexCache.has(trigger.include_words)) { + const toCache = mergeWordsIntoRegex(trigger.include_words, "i"); + regexCache.set(trigger.include_words, [toCache]); + } + const regexes = regexCache.get(trigger.include_words)!; + + for (const regex of regexes) { if (regex.test(link.input)) { return { extra: { type, link: link.input } }; } diff --git a/backend/src/plugins/Automod/triggers/matchRegex.ts b/backend/src/plugins/Automod/triggers/matchRegex.ts index 0f9ac41b..0afc3c1f 100644 --- a/backend/src/plugins/Automod/triggers/matchRegex.ts +++ b/backend/src/plugins/Automod/triggers/matchRegex.ts @@ -1,4 +1,3 @@ -import { Util } from "discord.js"; import * as t from "io-ts"; import { allowTimeout } from "../../../RegExpRunner"; import { normalizeText } from "../../../utils/normalizeText"; @@ -7,7 +6,7 @@ import { TRegex } from "../../../validatorUtils"; import { getTextMatchPartialSummary } from "../functions/getTextMatchPartialSummary"; import { MatchableTextType, matchMultipleTextTypesOnMessage } from "../functions/matchMultipleTextTypesOnMessage"; import { automodTrigger } from "../helpers"; -import { categorize } from "../../../utils/categorize"; +import { mergeRegexes } from "../../../utils/mergeRegexes"; interface MatchResultType { pattern: string; @@ -15,21 +14,6 @@ interface MatchResultType { } const regexCache = new WeakMap(); -const hasBackreference = /(?:^|[^\\]|[\\]{2})\\\d+/; - -function buildCacheableRegexes(sourceRegexes: RegExp[], flags: string) { - const categories = categorize(sourceRegexes, { - hasBackreferences: (regex) => hasBackreference.exec(regex.source) !== null, - safeToMerge: () => true, - }); - const regexes: RegExp[] = []; - if (categories.safeToMerge.length) { - const merged = categories.safeToMerge.map((r) => `(?:${r.source})`).join("|"); - regexes.push(new RegExp(merged, flags)); - } - regexes.push(...categories.hasBackreferences); - return regexes; -} export const MatchRegexTrigger = automodTrigger()({ configType: t.type({ @@ -64,7 +48,7 @@ export const MatchRegexTrigger = automodTrigger()({ if (!regexCache.has(trigger)) { const flags = trigger.case_sensitive ? "" : "i"; - const toCache = buildCacheableRegexes(trigger.patterns, flags); + const toCache = mergeRegexes(trigger.patterns, flags); regexCache.set(trigger, toCache); } const regexes = regexCache.get(trigger)!; diff --git a/backend/src/plugins/Automod/triggers/matchWords.ts b/backend/src/plugins/Automod/triggers/matchWords.ts index fda24726..c7603496 100644 --- a/backend/src/plugins/Automod/triggers/matchWords.ts +++ b/backend/src/plugins/Automod/triggers/matchWords.ts @@ -12,6 +12,8 @@ interface MatchResultType { type: MatchableTextType; } +const regexCache = new WeakMap(); + export const MatchWordsTrigger = automodTrigger()({ configType: t.type({ words: t.array(t.string), @@ -49,6 +51,25 @@ export const MatchWordsTrigger = automodTrigger()({ return; } + if (!regexCache.has(trigger)) { + const looseMatchingThreshold = Math.min(Math.max(trigger.loose_matching_threshold, 1), 64); + const patterns = trigger.words.map((word) => { + let pattern = trigger.loose_matching + ? [...word].map((c) => escapeStringRegexp(c)).join(`(?:\\s*|.{0,${looseMatchingThreshold})`) + : escapeStringRegexp(word); + + if (trigger.only_full_words) { + pattern = `\\b${pattern}\\b`; + } + + return pattern; + }); + + const mergedRegex = new RegExp(patterns.map((p) => `(?:${p})`).join("|"), trigger.case_sensitive ? "" : "i"); + regexCache.set(trigger, [mergedRegex]); + } + const regexes = regexCache.get(trigger)!; + for await (let [type, str] of matchMultipleTextTypesOnMessage(pluginData, trigger, context.message)) { if (trigger.strip_markdown) { str = stripMarkdown(str); @@ -58,26 +79,12 @@ export const MatchWordsTrigger = automodTrigger()({ str = normalizeText(str); } - const looseMatchingThreshold = Math.min(Math.max(trigger.loose_matching_threshold, 1), 64); - - for (const word of trigger.words) { - // When performing loose matching, allow any amount of whitespace or up to looseMatchingThreshold number of other - // characters between the matched characters. E.g. if we're matching banana, a loose match could also match b a n a n a - let pattern = trigger.loose_matching - ? [...word].map((c) => escapeStringRegexp(c)).join(`(?:\\s*|.{0,${looseMatchingThreshold})`) - : escapeStringRegexp(word); - - if (trigger.only_full_words) { - pattern = `\\b${pattern}\\b`; - } - - const regex = new RegExp(pattern, trigger.case_sensitive ? "" : "i"); - const test = regex.test(str); - if (test) { + for (const regex of regexes) { + if (regex.test(str)) { return { extra: { - word, type, + word: "", }, }; } @@ -89,6 +96,6 @@ export const MatchWordsTrigger = automodTrigger()({ renderMatchInformation({ pluginData, contexts, matchResult }) { const partialSummary = getTextMatchPartialSummary(pluginData, matchResult.extra.type, contexts[0]); - return `Matched word \`${Util.escapeInlineCode(matchResult.extra.word)}\` in ${partialSummary}`; + return `Matched word in ${partialSummary}`; }, }); diff --git a/backend/src/utils/mergeRegexes.ts b/backend/src/utils/mergeRegexes.ts new file mode 100644 index 00000000..b503d12e --- /dev/null +++ b/backend/src/utils/mergeRegexes.ts @@ -0,0 +1,17 @@ +import { categorize } from "./categorize"; + +const hasBackreference = /(?:^|[^\\]|[\\]{2})\\\d+/; + +export function mergeRegexes(sourceRegexes: RegExp[], flags: string): RegExp[] { + const categories = categorize(sourceRegexes, { + hasBackreferences: (regex) => hasBackreference.exec(regex.source) !== null, + safeToMerge: () => true, + }); + const regexes: RegExp[] = []; + if (categories.safeToMerge.length) { + const merged = categories.safeToMerge.map((r) => `(?:${r.source})`).join("|"); + regexes.push(new RegExp(merged, flags)); + } + regexes.push(...categories.hasBackreferences); + return regexes; +} diff --git a/backend/src/utils/mergeWordsIntoRegex.ts b/backend/src/utils/mergeWordsIntoRegex.ts new file mode 100644 index 00000000..9019b44c --- /dev/null +++ b/backend/src/utils/mergeWordsIntoRegex.ts @@ -0,0 +1,6 @@ +import escapeStringRegexp from "escape-string-regexp"; + +export function mergeWordsIntoRegex(words: string[], flags?: string) { + const source = words.map((word) => `(?:${escapeStringRegexp(word)})`).join("|"); + return new RegExp(source, flags); +}