From 70b3003344dd7dc4d815406d955ac48857d89c55 Mon Sep 17 00:00:00 2001 From: Faisal Salman Date: Sat, 16 Nov 2024 22:14:14 +0700 Subject: [PATCH] [submodule:helpers] Add new method `isAIBot()`: detect AI bots --- README.md | 10 +++- src/extensions/ua-parser-extensions.js | 8 +-- src/helpers/ua-parser-helpers.d.ts | 2 + src/helpers/ua-parser-helpers.js | 72 ++++++++++++++++++++++++++ test/mocha-test-helpers.js | 16 +++++- 5 files changed, 102 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7b0aa5fa8..aa7e1c0b7 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,15 @@ see what's new & breaking. ✅ - Extras (Apps, Libs, Emails, Media Players, etc) + AI Bot detection + ❌ + ✅ + ✅ + ✅ + ✅ + + + Extras (Apps, Libs, Emails, Media Players, etc) detection ❌ ✅ ✅ diff --git a/src/extensions/ua-parser-extensions.js b/src/extensions/ua-parser-extensions.js index bd9d517d6..5ed4f758b 100644 --- a/src/extensions/ua-parser-extensions.js +++ b/src/extensions/ua-parser-extensions.js @@ -90,8 +90,8 @@ const Crawlers = Object.freeze({ // Yeti (Naver) /(yeti)\/([\w\.]+)/i, - // aiHitBot / Cohere-AI / Diffbot / Magpie-Crawler / Omgilibot / Webzio-Extended / Screaming Frog SEO Spider / Timpibot / VelenPublicWebCrawler / YisouSpider / YouBot - /((?:aihit|diff|timpi|you)bot|cohere-ai|omgili(?:bot)?|(?:magpie-|velenpublicweb)crawler|webzio-extended|(?:screaming frog seo |yisou)spider)\/?([\w\.]*)/i + // aiHitBot / Diffbot / Magpie-Crawler / Omgilibot / Webzio-Extended / Screaming Frog SEO Spider / Timpibot / VelenPublicWebCrawler / YisouSpider / YouBot + /((?:aihit|diff|timpi|you)bot|omgili(?:bot)?|(?:magpie-|velenpublicweb)crawler|webzio-extended|(?:screaming frog seo |yisou)spider)\/?([\w\.]*)/i ], [NAME, VERSION, [TYPE, CRAWLER]], @@ -241,8 +241,8 @@ const Fetchers = Object.freeze({ ], [NAME, VERSION, [TYPE, FETCHER]], - // Google Bots / Snapchat / Vercelbot - [/(vercelbot|feedfetcher-google|google(?:-read-aloud|producer)|(?=bot; )snapchat)/i], + // Google Bots / Cohere / Snapchat / Vercelbot + [/(cohere-ai|vercelbot|feedfetcher-google|google(?:-read-aloud|producer)|(?=bot; )snapchat)/i], [NAME, [TYPE, FETCHER]], ] }); diff --git a/src/helpers/ua-parser-helpers.d.ts b/src/helpers/ua-parser-helpers.d.ts index 0e6147761..4564a23c2 100644 --- a/src/helpers/ua-parser-helpers.d.ts +++ b/src/helpers/ua-parser-helpers.d.ts @@ -6,6 +6,7 @@ import { IResult } from "../main/ua-parser"; declare function getDeviceVendor(model: string): string | undefined; declare function isAppleSilicon(resultOrUA: IResult | string): boolean; +declare function isAIBot(resultOrUA: IResult | string): boolean; declare function isBot(resultOrUA: IResult | string): boolean; declare function isChromeFamily(resultOrUA: IResult | string): boolean; declare function isElectron(): boolean; @@ -16,6 +17,7 @@ declare function isStandalonePWA(): boolean; export { getDeviceVendor, isAppleSilicon, + isAIBot, isBot, isChromeFamily, isElectron, diff --git a/src/helpers/ua-parser-helpers.js b/src/helpers/ua-parser-helpers.js index f3f19c80f..17946ee93 100644 --- a/src/helpers/ua-parser-helpers.js +++ b/src/helpers/ua-parser-helpers.js @@ -41,6 +41,77 @@ const isAppleSilicon = (resultOrUA) => { return false; } +const isAIBot = (resultOrUA) => [ + + // AI2 + 'ai2bot', + + // Amazon + 'amazonbot', + + // Anthropic + 'anthropic-ai', + 'claude-web', + 'claudebot', + + // Apple + 'applebot', + 'applebot-extended', + + // ByteDance + 'bytespider', + + // Common Crawl + 'ccbot', + + // DataForSeo + 'dataforseobot', + + // Diffbot + 'diffbot', + + // Google + 'googleother', + 'googleother-image', + 'googleother-video', + 'google-extended', + + // Hive AI + 'imagesiftbot', + + // Huawei + 'petalbot', + + // Meta + 'facebookbot', + 'meta-externalagent', + + // OpenAI + 'gptbot', + 'oai-searchbot', + + // Perplexity + 'perplexitybot', + + // Timpi + 'timpibot', + + // Velen.io + 'velenpublicwebcrawler', + + // Webz.io + 'omgili', + 'omgilibot', + 'webzio-extended', + + // You.com + 'youbot', + + // Zyte + 'scrapy' + + ].includes(String(toResult(resultOrUA, Bots).browser.name).toLowerCase()); + const isBot = (resultOrUA) => [ 'cli', 'crawler', @@ -56,6 +127,7 @@ const isElectron = () => !!(process?.versions?.hasOwnProperty('electron') || module.exports = { getDeviceVendor, isAppleSilicon, + isAIBot, isBot, isChromeFamily, isElectron, diff --git a/test/mocha-test-helpers.js b/test/mocha-test-helpers.js index a2f25cdc2..d6170bcb5 100644 --- a/test/mocha-test-helpers.js +++ b/test/mocha-test-helpers.js @@ -1,6 +1,6 @@ const assert = require('assert'); const { UAParser } = require('../src/main/ua-parser'); -const { getDeviceVendor, isAppleSilicon, isBot, isChromeFamily } = require('../src/helpers/ua-parser-helpers'); +const { getDeviceVendor, isAppleSilicon, isAIBot, isBot, isChromeFamily } = require('../src/helpers/ua-parser-helpers'); const { Bots, Emails } = require('../src/extensions/ua-parser-extensions'); describe('getDeviceVendor', () => { @@ -34,6 +34,20 @@ describe('isAppleSilicon', () => { }); }); +describe('isAIBot', () => { + it('Can detect AI Bots', () => { + + const claudeBot = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)'; + const firefox = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0'; + const searchGPT = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'; + + assert.equal(isAIBot(UAParser(claudeBot, Bots)), true); + assert.equal(isAIBot(claudeBot), true); + assert.equal(isAIBot(firefox), false); + assert.equal(isAIBot(searchGPT), true); + }); +}); + describe('isBot', () => { it('Can detect Bots', () => {