From 4410f20562ab37a15352bfbcdae947dd91aae96b Mon Sep 17 00:00:00 2001
From: Dragory <2606411+Dragory@users.noreply.github.com>
Date: Wed, 28 Apr 2021 21:12:19 +0300
Subject: [PATCH] Improve invite detection

Several less common patterns are now detected:
discord.gg/anything/here/<code>
discord.com/invite/<code>/anything/here
discordapp.com/invite/<code>/anything/here

Potential invite URLs are also parsed as URLs to clean out any
shenanigans related to valid-but-uncommon URL formats.
---
 backend/src/utils.ts | 62 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/backend/src/utils.ts b/backend/src/utils.ts
index 98ec406f..059ddac9 100644
--- a/backend/src/utils.ts
+++ b/backend/src/utils.ts
@@ -20,7 +20,7 @@ import {
   TextChannel,
   User,
 } from "eris";
-import url from "url";
+import { URL } from "url";
 import tlds from "tlds";
 import emojiRegex from "emoji-regex";
 import * as t from "io-ts";
@@ -481,7 +481,7 @@ const plainLinkRegex = /((?!https?:\/\/)\S)+\.\S+/; // anything.anything, withou
 const urlRegex = new RegExp(`(${realLinkRegex.source}|${plainLinkRegex.source})`, "g");
 const protocolRegex = /^[a-z]+:\/\//;
 
-interface MatchedURL extends url.URL {
+interface MatchedURL extends URL {
   input: string;
 }
 
@@ -496,7 +496,7 @@ export function getUrlsInString(str: string, onlyUnique = false): MatchedURL[] {
 
     let matchUrl: MatchedURL;
     try {
-      matchUrl = new url.URL(withProtocol) as MatchedURL;
+      matchUrl = new URL(withProtocol) as MatchedURL;
       matchUrl.input = match;
     } catch (e) {
       return urls;
@@ -520,9 +520,61 @@ export function parseInviteCodeInput(str: string): string {
   return getInviteCodesInString(str)[0];
 }
 
+export function isNotNull(value): value is Exclude<typeof value, null> {
+  return value != null;
+}
+
+// discord.com/invite/<code>
+// discordapp.com/invite/<code>
+// discord.gg/invite/<code>
+// discord.gg/<code>
+const quickInviteDetection = /(?:discord.com|discordapp.com)\/invite\/([^\s\/#?]+)|discord.gg\/(?:\S+\/)?([^\s\/#?]+)/gi;
+
+const isInviteHostRegex = /(?:^|\.)(?:discord.gg|discord.com|discordapp.com)$/;
+const longInvitePathRegex = /^\/invite\/([^\s\/]+)$/;
+
 export function getInviteCodesInString(str: string): string[] {
-  const inviteCodeRegex = /(?:discord.gg|discordapp.com\/invite|discord.com\/invite)\/([a-z0-9\-]+)/gi;
-  return Array.from(str.matchAll(inviteCodeRegex)).map(m => m[1]);
+  const inviteCodes: string[] = [];
+
+  // Clean up markdown
+  str = str.replace(/[|*_~]/g, "");
+
+  // Quick detection
+  const quickDetectionMatch = str.matchAll(quickInviteDetection);
+  if (quickDetectionMatch) {
+    inviteCodes.push(...[...quickDetectionMatch].map(m => m[1] || m[2]));
+  }
+
+  // Deep detection via URL parsing
+  const linksInString = getUrlsInString(str, true);
+  const potentialInviteLinks = linksInString.filter(url => isInviteHostRegex.test(url.hostname));
+  const withNormalizedPaths = potentialInviteLinks.map(url => {
+    url.pathname = url.pathname.replace(/\/{2,}/g, "/").replace(/\/+$/g, "");
+    return url;
+  });
+
+  const codesFromInviteLinks = withNormalizedPaths
+    .map(url => {
+      // discord.gg/[anything/]<code>
+      if (url.hostname === "discord.gg") {
+        const parts = url.pathname.split("/").filter(Boolean);
+        return parts[parts.length - 1];
+      }
+
+      // discord.com/invite/<code>[/anything]
+      // discordapp.com/invite/<code>[/anything]
+      const longInviteMatch = url.pathname.match(longInvitePathRegex);
+      if (longInviteMatch) {
+        return longInviteMatch[1];
+      }
+
+      return null;
+    })
+    .filter(Boolean) as string[];
+
+  inviteCodes.push(...codesFromInviteLinks);
+
+  return unique(inviteCodes);
 }
 
 export const unicodeEmojiRegex = emojiRegex();