2

Harden verified bot detection

This commit is contained in:
2026-03-12 16:45:11 +01:00
parent 0bc2d2b689
commit b7943e69db
4 changed files with 265 additions and 17 deletions

View File

@@ -62,7 +62,8 @@
- The background worker only fills in missing investigations; it does not continuously re-check cached intelligence. - The background worker only fills in missing investigations; it does not continuously re-check cached intelligence.
- Opening an IP details page reuses the cached investigation. - Opening an IP details page reuses the cached investigation.
- `Refresh investigation` is the explicit action that forces a new lookup. - `Refresh investigation` is the explicit action that forces a new lookup.
- Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, and DuckDuckGo. - Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex.
- When an official crawler publishes IP ranges, the daemon prefers those ranges and can combine them with User-Agent verification when the provider documents distinct bot user agents.
- When an address is not identified as a verified bot, the daemon can collect reverse DNS, forward-confirmed reverse DNS, RDAP registration details, and Spamhaus DNSBL status. - When an address is not identified as a verified bot, the daemon can collect reverse DNS, forward-confirmed reverse DNS, RDAP registration details, and Spamhaus DNSBL status.
## Caddy log requirements ## Caddy log requirements

View File

@@ -82,6 +82,10 @@ Current releases can collect:
- RDAP registration details such as network name, organization, country, prefix, and abuse contact - RDAP registration details such as network name, organization, country, prefix, and abuse contact
- Spamhaus listed or not listed status - Spamhaus listed or not listed status
Built-in verified bot providers currently cover Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex.
When a provider publishes official crawler ranges, the daemon uses those published ranges as the source of truth and can also require a matching User-Agent token for provider families that expose several distinct crawlers.
## `opnsense` ## `opnsense`
Controls the optional firewall backend. Controls the optional firewall backend.

View File

@@ -1,8 +1,9 @@
package investigation package investigation
import ( import (
"bufio"
"bytes"
"context" "context"
"encoding/csv"
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"errors" "errors"
@@ -293,17 +294,14 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
} }
return networks, nil return networks, nil
case "geofeed_csv": case "geofeed_csv":
reader := csv.NewReader(strings.NewReader(string(payload))) scanner := bufio.NewScanner(bytes.NewReader(payload))
rows, err := reader.ReadAll() networks := make([]netip.Prefix, 0, 64)
if err != nil { for scanner.Scan() {
return nil, fmt.Errorf("decode geofeed payload from %s: %w", sourceURL, err) line := strings.TrimSpace(scanner.Text())
} if line == "" || strings.HasPrefix(line, "#") {
networks := make([]netip.Prefix, 0, len(rows))
for _, row := range rows {
if len(row) == 0 {
continue continue
} }
candidate := strings.TrimSpace(row[0]) candidate := strings.TrimSpace(strings.SplitN(line, ",", 2)[0])
if candidate == "" || strings.HasPrefix(candidate, "#") { if candidate == "" || strings.HasPrefix(candidate, "#") {
continue continue
} }
@@ -313,6 +311,9 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
} }
networks = append(networks, prefix.Masked()) networks = append(networks, prefix.Masked())
} }
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("scan geofeed payload from %s: %w", sourceURL, err)
}
return networks, nil return networks, nil
default: default:
return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL) return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL)
@@ -533,6 +534,7 @@ func defaultBotProviders() []botProvider {
Icon: "🤖", Icon: "🤖",
SourceFormat: "json_prefixes", SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour, CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.bing.com/toolbox/bingbot.json"},
ReverseDNSSuffixes: []string{".search.msn.com"}, ReverseDNSSuffixes: []string{".search.msn.com"},
}, },
{ {
@@ -552,11 +554,11 @@ func defaultBotProviders() []botProvider {
CacheTTL: 24 * time.Hour, CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"}, IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"},
UserAgentPrefixes: []string{ UserAgentPrefixes: []string{
"facebookexternalhit/", "facebookexternalhit",
"meta-webindexer/", "meta-webindexer",
"meta-externalads/", "meta-externalads",
"meta-externalagent/", "meta-externalagent",
"meta-externalfetcher/", "meta-externalfetcher",
}, },
}, },
{ {
@@ -567,6 +569,68 @@ func defaultBotProviders() []botProvider {
CacheTTL: 24 * time.Hour, CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"}, IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"},
}, },
{
ID: "openai_gptbot_official",
Name: "GPTBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/gptbot.json"},
UserAgentPrefixes: []string{
"gptbot",
},
},
{
ID: "openai_chatgpt_user_official",
Name: "ChatGPT-User",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/chatgpt-user.json"},
UserAgentPrefixes: []string{
"chatgpt-user",
},
},
{
ID: "openai_oai_searchbot_official",
Name: "OAI-SearchBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/searchbot.json"},
UserAgentPrefixes: []string{
"oai-searchbot",
},
},
{
ID: "perplexitybot_official",
Name: "PerplexityBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.perplexity.com/perplexitybot.json"},
UserAgentPrefixes: []string{
"perplexitybot",
},
},
{
ID: "perplexity_user_official",
Name: "Perplexity-User",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.perplexity.com/perplexity-user.json"},
UserAgentPrefixes: []string{
"perplexity-user",
},
},
{
ID: "yandex_official",
Name: "YandexBot",
Icon: "🤖",
CacheTTL: 24 * time.Hour,
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
},
} }
} }
@@ -582,14 +646,49 @@ func ipMatchesPrefixes(ip netip.Addr, prefixes []netip.Prefix) bool {
func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool { func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool {
for _, agent := range userAgents { for _, agent := range userAgents {
for _, prefix := range prefixes { for _, prefix := range prefixes {
if strings.HasPrefix(agent, prefix) { candidate := strings.ToLower(strings.TrimSpace(strings.TrimSuffix(prefix, "/")))
if candidate == "" {
continue
}
if strings.HasPrefix(agent, candidate) {
return true return true
} }
for _, token := range splitUserAgentTokens(agent) {
if token == candidate {
return true
}
}
} }
} }
return false return false
} }
// splitUserAgentTokens returns the distinct, lowercased product names found in
// userAgent, in order of first appearance.
//
// The string is split on spaces, tabs, semicolons, commas and parentheses.
// For every resulting field only the part before the first "/" is kept, so a
// version suffix such as "/1.0" is dropped. Fields whose name is empty are
// skipped and repeated names are reported once.
func splitUserAgentTokens(userAgent string) []string {
	isSeparator := func(r rune) bool {
		return strings.ContainsRune(" ;(),\t", r)
	}
	fields := strings.FieldsFunc(userAgent, isSeparator)

	tokens := make([]string, 0, len(fields))
	known := make(map[string]struct{}, len(fields))
	for _, field := range fields {
		// Keep only the product name: everything before the first slash.
		name, _, _ := strings.Cut(field, "/")
		name = strings.ToLower(strings.TrimSpace(name))
		if name == "" {
			continue
		}
		if _, duplicate := known[name]; duplicate {
			continue
		}
		known[name] = struct{}{}
		tokens = append(tokens, name)
	}
	return tokens
}
func normalizeUserAgents(userAgents []string) []string { func normalizeUserAgents(userAgents []string) []string {
items := make([]string, 0, len(userAgents)) items := make([]string, 0, len(userAgents))
for _, userAgent := range userAgents { for _, userAgent := range userAgents {
@@ -634,6 +733,9 @@ func extractBotHintName(userAgent string) string {
continue continue
} }
normalized := strings.ToLower(base) normalized := strings.ToLower(base)
if strings.HasPrefix(normalized, "+") || strings.Contains(normalized, "@") {
continue
}
if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") { if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") {
return base return base
} }

View File

@@ -176,6 +176,147 @@ func TestInvestigateAddsBotHintFromUserAgent(t *testing.T) {
} }
} }
// TestParsePublishedNetworksSupportsCommentedGeofeed feeds parsePublishedNetworks
// a "geofeed_csv" payload that begins with a "#" comment line and ends with a
// blank line, and expects exactly the three data rows to come back as
// prefixes, in input order.
func TestParsePublishedNetworksSupportsCommentedGeofeed(t *testing.T) {
	t.Parallel()

	geofeedLines := []string{
		"# Publication date: Thu Mar 12 2026",
		"31.13.78.0/24,NZ,NZ-AUK,Auckland,",
		"2a03:2880:f061::/48,NZ,NZ-AUK,Auckland,",
		"31.13.72.0/24,SE,,Bromma,",
		"",
	}
	payload := []byte(strings.Join(geofeedLines, "\n"))

	networks, err := parsePublishedNetworks(payload, "geofeed_csv", "https://example.test/geofeed")
	if err != nil {
		t.Fatalf("parse geofeed: %v", err)
	}
	if count := len(networks); count != 3 {
		t.Fatalf("expected 3 prefixes, got %d", count)
	}

	// Compare the textual form of each prefix against the expected list.
	want := []string{"31.13.78.0/24", "2a03:2880:f061::/48", "31.13.72.0/24"}
	got := make([]string, 0, len(networks))
	for _, network := range networks {
		got = append(got, network.String())
	}
	if strings.Join(got, ",") != strings.Join(want, ",") {
		t.Fatalf("unexpected geofeed prefixes: got %v want %v", got, want)
	}
}
func TestInvestigateRecognizesOpenAIBotViaEmbeddedUserAgentToken(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/gptbot.json" {
http.NotFound(w, r)
return
}
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`))
}))
defer server.Close()
svc := newService(
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
server.Client(),
&fakeResolver{},
log.New(testWriter{t}, "", 0),
[]botProvider{{
ID: "openai_gptbot_official",
Name: "GPTBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: time.Hour,
IPRangeURLs: []string{server.URL + "/gptbot.json"},
UserAgentPrefixes: []string{
"gptbot",
},
}},
map[string]string{},
)
investigation, err := svc.Investigate(context.Background(), "203.0.113.10", []string{"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"})
if err != nil {
t.Fatalf("investigate ip: %v", err)
}
if investigation.Bot == nil || investigation.Bot.Name != "GPTBot" || !investigation.Bot.Verified {
t.Fatalf("expected verified GPTBot match, got %+v", investigation.Bot)
}
if investigation.Bot.Method != "user_agent+published_ranges" {
t.Fatalf("expected combined method, got %+v", investigation.Bot)
}
}
func TestInvestigateRecognizesPerplexityBotViaEmbeddedUserAgentToken(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/perplexitybot.json" {
http.NotFound(w, r)
return
}
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"198.51.100.0/24"}]}`))
}))
defer server.Close()
svc := newService(
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
server.Client(),
&fakeResolver{},
log.New(testWriter{t}, "", 0),
[]botProvider{{
ID: "perplexitybot_official",
Name: "PerplexityBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: time.Hour,
IPRangeURLs: []string{server.URL + "/perplexitybot.json"},
UserAgentPrefixes: []string{
"perplexitybot",
},
}},
map[string]string{},
)
investigation, err := svc.Investigate(context.Background(), "198.51.100.42", []string{"Mozilla/5.0 (compatible; PerplexityBot/1.0; +https://www.perplexity.ai/perplexitybot)"})
if err != nil {
t.Fatalf("investigate ip: %v", err)
}
if investigation.Bot == nil || investigation.Bot.Name != "PerplexityBot" || !investigation.Bot.Verified {
t.Fatalf("expected verified PerplexityBot match, got %+v", investigation.Bot)
}
}
// TestInvestigateRecognizesYandexViaReverseDNS configures a provider that has
// only reverse DNS suffixes (no published ranges) and a fake resolver whose
// reverse record for the address points at a ".yandex.ru" host that resolves
// forward to the same address. The investigation must report a verified
// YandexBot using the "reverse_dns+fcrdns" method.
func TestInvestigateRecognizesYandexViaReverseDNS(t *testing.T) {
	t.Parallel()

	// Reverse and forward records agree, so the lookup is forward-confirmed.
	reverseRecords := map[string][]string{
		"203.0.113.55": {"spider-55.search.yandex.ru."},
	}
	forwardRecords := map[string][]net.IPAddr{
		"spider-55.search.yandex.ru": {{IP: net.ParseIP("203.0.113.55")}},
	}
	resolver := &fakeResolver{
		reverse: reverseRecords,
		forward: forwardRecords,
	}

	cfg := config.InvestigationConfig{
		Enabled:         true,
		Timeout:         config.Duration{Duration: time.Second},
		UserAgent:       "test-agent",
		SpamhausEnabled: true,
	}
	provider := botProvider{
		ID:                 "yandex_official",
		Name:               "YandexBot",
		Icon:               "🤖",
		CacheTTL:           time.Hour,
		ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
	}
	svc := newService(
		cfg,
		http.DefaultClient,
		resolver,
		log.New(testWriter{t}, "", 0),
		[]botProvider{provider},
		map[string]string{},
	)

	userAgents := []string{"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"}
	investigation, err := svc.Investigate(context.Background(), "203.0.113.55", userAgents)
	if err != nil {
		t.Fatalf("investigate ip: %v", err)
	}

	bot := investigation.Bot
	if bot == nil || bot.Name != "YandexBot" || !bot.Verified {
		t.Fatalf("expected verified YandexBot match, got %+v", bot)
	}
	if bot.Method != "reverse_dns+fcrdns" {
		t.Fatalf("expected reverse DNS verification, got %+v", bot)
	}
}
func TestPublishedNetworksAreCached(t *testing.T) { func TestPublishedNetworksAreCached(t *testing.T) {
t.Parallel() t.Parallel()