2

Harden verified bot detection

This commit is contained in:
2026-03-12 16:45:11 +01:00
parent 0bc2d2b689
commit b7943e69db
4 changed files with 265 additions and 17 deletions

View File

@@ -1,8 +1,9 @@
package investigation
import (
"bufio"
"bytes"
"context"
"encoding/csv"
"encoding/hex"
"encoding/json"
"errors"
@@ -293,17 +294,14 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
}
return networks, nil
case "geofeed_csv":
reader := csv.NewReader(strings.NewReader(string(payload)))
rows, err := reader.ReadAll()
if err != nil {
return nil, fmt.Errorf("decode geofeed payload from %s: %w", sourceURL, err)
}
networks := make([]netip.Prefix, 0, len(rows))
for _, row := range rows {
if len(row) == 0 {
scanner := bufio.NewScanner(bytes.NewReader(payload))
networks := make([]netip.Prefix, 0, 64)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
candidate := strings.TrimSpace(row[0])
candidate := strings.TrimSpace(strings.SplitN(line, ",", 2)[0])
if candidate == "" || strings.HasPrefix(candidate, "#") {
continue
}
@@ -313,6 +311,9 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
}
networks = append(networks, prefix.Masked())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("scan geofeed payload from %s: %w", sourceURL, err)
}
return networks, nil
default:
return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL)
@@ -533,6 +534,7 @@ func defaultBotProviders() []botProvider {
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.bing.com/toolbox/bingbot.json"},
ReverseDNSSuffixes: []string{".search.msn.com"},
},
{
@@ -552,11 +554,11 @@ func defaultBotProviders() []botProvider {
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"},
UserAgentPrefixes: []string{
"facebookexternalhit/",
"meta-webindexer/",
"meta-externalads/",
"meta-externalagent/",
"meta-externalfetcher/",
"facebookexternalhit",
"meta-webindexer",
"meta-externalads",
"meta-externalagent",
"meta-externalfetcher",
},
},
{
@@ -567,6 +569,68 @@ func defaultBotProviders() []botProvider {
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"},
},
{
ID: "openai_gptbot_official",
Name: "GPTBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/gptbot.json"},
UserAgentPrefixes: []string{
"gptbot",
},
},
{
ID: "openai_chatgpt_user_official",
Name: "ChatGPT-User",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/chatgpt-user.json"},
UserAgentPrefixes: []string{
"chatgpt-user",
},
},
{
ID: "openai_oai_searchbot_official",
Name: "OAI-SearchBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://openai.com/searchbot.json"},
UserAgentPrefixes: []string{
"oai-searchbot",
},
},
{
ID: "perplexitybot_official",
Name: "PerplexityBot",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.perplexity.com/perplexitybot.json"},
UserAgentPrefixes: []string{
"perplexitybot",
},
},
{
ID: "perplexity_user_official",
Name: "Perplexity-User",
Icon: "🤖",
SourceFormat: "json_prefixes",
CacheTTL: 24 * time.Hour,
IPRangeURLs: []string{"https://www.perplexity.com/perplexity-user.json"},
UserAgentPrefixes: []string{
"perplexity-user",
},
},
{
ID: "yandex_official",
Name: "YandexBot",
Icon: "🤖",
CacheTTL: 24 * time.Hour,
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
},
}
}
@@ -582,14 +646,49 @@ func ipMatchesPrefixes(ip netip.Addr, prefixes []netip.Prefix) bool {
// userAgentMatchesPrefixes reports whether any of the supplied user-agent
// strings matches one of the configured bot identifiers. A configured value
// matches either as a literal prefix of the agent string or as an exact
// product token inside it (e.g. "gptbot" matches "mozilla/5.0 ... gptbot/1.0").
// Configured values are normalized (trimmed, trailing "/" dropped, lowercased)
// before comparison; empty values are ignored.
//
// NOTE(review): the prefix comparison is case-sensitive against the raw
// agent string — presumably agents were already lowercased by
// normalizeUserAgents upstream; confirm against the caller. The token
// comparison lowercases independently and is safe either way.
func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool {
	for _, agent := range userAgents {
		// The token set depends only on the agent, not on the prefix being
		// checked — compute it lazily, at most once per agent.
		var tokens []string
		tokenized := false
		for _, prefix := range prefixes {
			candidate := strings.ToLower(strings.TrimSpace(strings.TrimSuffix(prefix, "/")))
			if candidate == "" {
				continue
			}
			if strings.HasPrefix(agent, candidate) {
				return true
			}
			if !tokenized {
				tokens = splitUserAgentTokens(agent)
				tokenized = true
			}
			for _, token := range tokens {
				if token == candidate {
					return true
				}
			}
		}
	}
	return false
}

// splitUserAgentTokens breaks a user-agent string into normalized product
// tokens: the string is split on common UA delimiters (space, tab, ";", "(",
// ")", ","), each part is truncated at its first "/" (dropping the version),
// lowercased, and deduplicated while preserving first-seen order.
func splitUserAgentTokens(userAgent string) []string {
	parts := strings.FieldsFunc(userAgent, func(value rune) bool {
		switch value {
		case ' ', ';', '(', ')', ',', '\t':
			return true
		default:
			return false
		}
	})
	items := make([]string, 0, len(parts))
	seen := make(map[string]struct{}, len(parts))
	for _, part := range parts {
		// Keep only the product name, e.g. "GPTBot/1.0" -> "GPTBot".
		base := strings.TrimSpace(strings.SplitN(part, "/", 2)[0])
		if base == "" {
			continue
		}
		normalized := strings.ToLower(base)
		if _, ok := seen[normalized]; ok {
			continue
		}
		seen[normalized] = struct{}{}
		items = append(items, normalized)
	}
	return items
}
func normalizeUserAgents(userAgents []string) []string {
items := make([]string, 0, len(userAgents))
for _, userAgent := range userAgents {
@@ -634,6 +733,9 @@ func extractBotHintName(userAgent string) string {
continue
}
normalized := strings.ToLower(base)
if strings.HasPrefix(normalized, "+") || strings.Contains(normalized, "@") {
continue
}
if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") {
return base
}