From b7943e69dba0b92aa8f34026b0dc9b979561bcbf Mon Sep 17 00:00:00 2001 From: "Codex, agent ChatGPT" Date: Thu, 12 Mar 2026 16:45:11 +0100 Subject: [PATCH] Harden verified bot detection --- README.md | 3 +- docs/configuration.md | 4 + internal/investigation/service.go | 134 ++++++++++++++++++++--- internal/investigation/service_test.go | 141 +++++++++++++++++++++++++ 4 files changed, 265 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 64affe1..d746c6b 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,8 @@ - The background worker only fills in missing investigations; it does not continuously re-check cached intelligence. - Opening an IP details page reuses the cached investigation. - `Refresh investigation` is the explicit action that forces a new lookup. -- Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, and DuckDuckGo. +- Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex. +- When an official crawler publishes IP ranges, the daemon prefers those ranges and can combine them with User-Agent verification when the provider documents distinct bot user agents. - When an address is not identified as a verified bot, the daemon can collect reverse DNS, forward-confirmed reverse DNS, RDAP registration details, and Spamhaus DNSBL status. ## Caddy log requirements diff --git a/docs/configuration.md b/docs/configuration.md index 9600eee..da0eeef 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -82,6 +82,10 @@ Current releases can collect: - RDAP registration details such as network name, organization, country, prefix, and abuse contact - Spamhaus listed or not listed status +Built-in verified bot providers currently cover Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex. + +When a provider publishes official crawler ranges, the daemon uses those published ranges as the source of truth and can also require a matching User-Agent token for provider families that expose several distinct crawlers. + ## `opnsense` Controls the optional firewall backend. diff --git a/internal/investigation/service.go b/internal/investigation/service.go index 73dd237..74ba041 100644 --- a/internal/investigation/service.go +++ b/internal/investigation/service.go @@ -1,8 +1,9 @@ package investigation import ( + "bufio" + "bytes" "context" - "encoding/csv" "encoding/hex" "encoding/json" "errors" @@ -293,17 +294,14 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin } return networks, nil case "geofeed_csv": - reader := csv.NewReader(strings.NewReader(string(payload))) - rows, err := reader.ReadAll() - if err != nil { - return nil, fmt.Errorf("decode geofeed payload from %s: %w", sourceURL, err) - } - networks := make([]netip.Prefix, 0, len(rows)) - for _, row := range rows { - if len(row) == 0 { + scanner := bufio.NewScanner(bytes.NewReader(payload)) + networks := make([]netip.Prefix, 0, 64) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { continue } - candidate := strings.TrimSpace(row[0]) + candidate := strings.TrimSpace(strings.SplitN(line, ",", 2)[0]) if candidate == "" || strings.HasPrefix(candidate, "#") { continue } @@ -313,6 +311,9 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin } networks = append(networks, prefix.Masked()) } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scan geofeed payload from %s: %w", sourceURL, err) + } return networks, nil default: return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL) @@ -533,6 +534,7 @@ func defaultBotProviders() []botProvider { Icon: "🤖", SourceFormat: "json_prefixes", CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://www.bing.com/toolbox/bingbot.json"}, ReverseDNSSuffixes: []string{".search.msn.com"}, }, { @@ -552,11 +554,11 @@ func defaultBotProviders() []botProvider { CacheTTL: 24 * time.Hour, IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"}, UserAgentPrefixes: []string{ - "facebookexternalhit/", - "meta-webindexer/", - "meta-externalads/", - "meta-externalagent/", - "meta-externalfetcher/", + "facebookexternalhit", + "meta-webindexer", + "meta-externalads", + "meta-externalagent", + "meta-externalfetcher", }, }, { @@ -567,6 +569,68 @@ func defaultBotProviders() []botProvider { CacheTTL: 24 * time.Hour, IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"}, }, + { + ID: "openai_gptbot_official", + Name: "GPTBot", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://openai.com/gptbot.json"}, + UserAgentPrefixes: []string{ + "gptbot", + }, + }, + { + ID: "openai_chatgpt_user_official", + Name: "ChatGPT-User", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://openai.com/chatgpt-user.json"}, + UserAgentPrefixes: []string{ + "chatgpt-user", + }, + }, + { + ID: "openai_oai_searchbot_official", + Name: "OAI-SearchBot", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://openai.com/searchbot.json"}, + UserAgentPrefixes: []string{ + "oai-searchbot", + }, + }, + { + ID: "perplexitybot_official", + Name: "PerplexityBot", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://www.perplexity.com/perplexitybot.json"}, + UserAgentPrefixes: []string{ + "perplexitybot", + }, + }, + { + ID: "perplexity_user_official", + Name: "Perplexity-User", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: 24 * time.Hour, + IPRangeURLs: []string{"https://www.perplexity.com/perplexity-user.json"}, + UserAgentPrefixes: []string{ + "perplexity-user", + }, + }, + { + ID: "yandex_official", + Name: "YandexBot", + Icon: "🤖", + CacheTTL: 24 * time.Hour, + ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"}, + }, } } @@ -582,14 +646,49 @@ func ipMatchesPrefixes(ip netip.Addr, prefixes []netip.Prefix) bool { func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool { for _, agent := range userAgents { for _, prefix := range prefixes { - if strings.HasPrefix(agent, prefix) { + candidate := strings.ToLower(strings.TrimSpace(strings.TrimSuffix(prefix, "/"))) + if candidate == "" { + continue + } + if strings.HasPrefix(agent, candidate) { return true } + for _, token := range splitUserAgentTokens(agent) { + if token == candidate { + return true + } + } } } return false } +func splitUserAgentTokens(userAgent string) []string { + parts := strings.FieldsFunc(userAgent, func(value rune) bool { + switch value { + case ' ', ';', '(', ')', ',', '\t': + return true + default: + return false + } + }) + items := make([]string, 0, len(parts)) + seen := make(map[string]struct{}, len(parts)) + for _, part := range parts { + base := strings.TrimSpace(strings.SplitN(part, "/", 2)[0]) + if base == "" { + continue + } + normalized := strings.ToLower(base) + if _, ok := seen[normalized]; ok { + continue + } + seen[normalized] = struct{}{} + items = append(items, normalized) + } + return items +} + func normalizeUserAgents(userAgents []string) []string { items := make([]string, 0, len(userAgents)) for _, userAgent := range userAgents { @@ -634,6 +733,9 @@ func extractBotHintName(userAgent string) string { continue } normalized := strings.ToLower(base) + if strings.HasPrefix(normalized, "+") || strings.Contains(normalized, "@") { + continue + } if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") { return base } diff --git a/internal/investigation/service_test.go b/internal/investigation/service_test.go index 841f585..af48112 100644 --- a/internal/investigation/service_test.go +++ b/internal/investigation/service_test.go @@ -176,6 +176,147 @@ func TestInvestigateAddsBotHintFromUserAgent(t *testing.T) { } } +func TestParsePublishedNetworksSupportsCommentedGeofeed(t *testing.T) { + t.Parallel() + + prefixes, err := parsePublishedNetworks([]byte(strings.Join([]string{ + "# Publication date: Thu Mar 12 2026", + "31.13.78.0/24,NZ,NZ-AUK,Auckland,", + "2a03:2880:f061::/48,NZ,NZ-AUK,Auckland,", + "31.13.72.0/24,SE,,Bromma,", + "", + }, "\n")), "geofeed_csv", "https://example.test/geofeed") + if err != nil { + t.Fatalf("parse geofeed: %v", err) + } + if len(prefixes) != 3 { + t.Fatalf("expected 3 prefixes, got %d", len(prefixes)) + } + got := []string{prefixes[0].String(), prefixes[1].String(), prefixes[2].String()} + want := []string{"31.13.78.0/24", "2a03:2880:f061::/48", "31.13.72.0/24"} + if strings.Join(got, ",") != strings.Join(want, ",") { + t.Fatalf("unexpected geofeed prefixes: got %v want %v", got, want) + } +} + +func TestInvestigateRecognizesOpenAIBotViaEmbeddedUserAgentToken(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/gptbot.json" { + http.NotFound(w, r) + return + } + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`)) + })) + defer server.Close() + + svc := newService( + config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true}, + server.Client(), + &fakeResolver{}, + log.New(testWriter{t}, "", 0), + []botProvider{{ + ID: "openai_gptbot_official", + Name: "GPTBot", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: time.Hour, + IPRangeURLs: []string{server.URL + "/gptbot.json"}, + UserAgentPrefixes: []string{ + "gptbot", + }, + }}, + map[string]string{}, + ) + + investigation, err := svc.Investigate(context.Background(), "203.0.113.10", []string{"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"}) + if err != nil { + t.Fatalf("investigate ip: %v", err) + } + if investigation.Bot == nil || investigation.Bot.Name != "GPTBot" || !investigation.Bot.Verified { + t.Fatalf("expected verified GPTBot match, got %+v", investigation.Bot) + } + if investigation.Bot.Method != "user_agent+published_ranges" { + t.Fatalf("expected combined method, got %+v", investigation.Bot) + } +} + +func TestInvestigateRecognizesPerplexityBotViaEmbeddedUserAgentToken(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/perplexitybot.json" { + http.NotFound(w, r) + return + } + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"198.51.100.0/24"}]}`)) + })) + defer server.Close() + + svc := newService( + config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true}, + server.Client(), + &fakeResolver{}, + log.New(testWriter{t}, "", 0), + []botProvider{{ + ID: "perplexitybot_official", + Name: "PerplexityBot", + Icon: "🤖", + SourceFormat: "json_prefixes", + CacheTTL: time.Hour, + IPRangeURLs: []string{server.URL + "/perplexitybot.json"}, + UserAgentPrefixes: []string{ + "perplexitybot", + }, + }}, + map[string]string{}, + ) + + investigation, err := svc.Investigate(context.Background(), "198.51.100.42", []string{"Mozilla/5.0 (compatible; PerplexityBot/1.0; +https://www.perplexity.ai/perplexitybot)"}) + if err != nil { + t.Fatalf("investigate ip: %v", err) + } + if investigation.Bot == nil || investigation.Bot.Name != "PerplexityBot" || !investigation.Bot.Verified { + t.Fatalf("expected verified PerplexityBot match, got %+v", investigation.Bot) + } +} + +func TestInvestigateRecognizesYandexViaReverseDNS(t *testing.T) { + t.Parallel() + + resolver := &fakeResolver{ + reverse: map[string][]string{"203.0.113.55": {"spider-55.search.yandex.ru."}}, + forward: map[string][]net.IPAddr{"spider-55.search.yandex.ru": {{IP: net.ParseIP("203.0.113.55")}}}, + } + + svc := newService( + config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true}, + http.DefaultClient, + resolver, + log.New(testWriter{t}, "", 0), + []botProvider{{ + ID: "yandex_official", + Name: "YandexBot", + Icon: "🤖", + CacheTTL: time.Hour, + ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"}, + }}, + map[string]string{}, + ) + + investigation, err := svc.Investigate(context.Background(), "203.0.113.55", []string{"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"}) + if err != nil { + t.Fatalf("investigate ip: %v", err) + } + if investigation.Bot == nil || investigation.Bot.Name != "YandexBot" || !investigation.Bot.Verified { + t.Fatalf("expected verified YandexBot match, got %+v", investigation.Bot) + } + if investigation.Bot.Method != "reverse_dns+fcrdns" { + t.Fatalf("expected reverse DNS verification, got %+v", investigation.Bot) + } +} + func TestPublishedNetworksAreCached(t *testing.T) { t.Parallel()