You've already forked caddy-opnsense-blocker
Harden verified bot detection
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
package investigation
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
@@ -293,17 +294,14 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
|
||||
}
|
||||
return networks, nil
|
||||
case "geofeed_csv":
|
||||
reader := csv.NewReader(strings.NewReader(string(payload)))
|
||||
rows, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("decode geofeed payload from %s: %w", sourceURL, err)
|
||||
}
|
||||
networks := make([]netip.Prefix, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
if len(row) == 0 {
|
||||
scanner := bufio.NewScanner(bytes.NewReader(payload))
|
||||
networks := make([]netip.Prefix, 0, 64)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
candidate := strings.TrimSpace(row[0])
|
||||
candidate := strings.TrimSpace(strings.SplitN(line, ",", 2)[0])
|
||||
if candidate == "" || strings.HasPrefix(candidate, "#") {
|
||||
continue
|
||||
}
|
||||
@@ -313,6 +311,9 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
|
||||
}
|
||||
networks = append(networks, prefix.Masked())
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("scan geofeed payload from %s: %w", sourceURL, err)
|
||||
}
|
||||
return networks, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL)
|
||||
@@ -533,6 +534,7 @@ func defaultBotProviders() []botProvider {
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://www.bing.com/toolbox/bingbot.json"},
|
||||
ReverseDNSSuffixes: []string{".search.msn.com"},
|
||||
},
|
||||
{
|
||||
@@ -552,11 +554,11 @@ func defaultBotProviders() []botProvider {
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"},
|
||||
UserAgentPrefixes: []string{
|
||||
"facebookexternalhit/",
|
||||
"meta-webindexer/",
|
||||
"meta-externalads/",
|
||||
"meta-externalagent/",
|
||||
"meta-externalfetcher/",
|
||||
"facebookexternalhit",
|
||||
"meta-webindexer",
|
||||
"meta-externalads",
|
||||
"meta-externalagent",
|
||||
"meta-externalfetcher",
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -567,6 +569,68 @@ func defaultBotProviders() []botProvider {
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"},
|
||||
},
|
||||
{
|
||||
ID: "openai_gptbot_official",
|
||||
Name: "GPTBot",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://openai.com/gptbot.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"gptbot",
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "openai_chatgpt_user_official",
|
||||
Name: "ChatGPT-User",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://openai.com/chatgpt-user.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"chatgpt-user",
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "openai_oai_searchbot_official",
|
||||
Name: "OAI-SearchBot",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://openai.com/searchbot.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"oai-searchbot",
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "perplexitybot_official",
|
||||
Name: "PerplexityBot",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://www.perplexity.com/perplexitybot.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"perplexitybot",
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "perplexity_user_official",
|
||||
Name: "Perplexity-User",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
IPRangeURLs: []string{"https://www.perplexity.com/perplexity-user.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"perplexity-user",
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "yandex_official",
|
||||
Name: "YandexBot",
|
||||
Icon: "🤖",
|
||||
CacheTTL: 24 * time.Hour,
|
||||
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -582,14 +646,49 @@ func ipMatchesPrefixes(ip netip.Addr, prefixes []netip.Prefix) bool {
|
||||
func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool {
|
||||
for _, agent := range userAgents {
|
||||
for _, prefix := range prefixes {
|
||||
if strings.HasPrefix(agent, prefix) {
|
||||
candidate := strings.ToLower(strings.TrimSpace(strings.TrimSuffix(prefix, "/")))
|
||||
if candidate == "" {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(agent, candidate) {
|
||||
return true
|
||||
}
|
||||
for _, token := range splitUserAgentTokens(agent) {
|
||||
if token == candidate {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// splitUserAgentTokens breaks a User-Agent string into its distinct,
// lower-cased product names.
//
// The input is split on spaces, tabs, semicolons, commas and parentheses.
// For every resulting piece only the text before the first "/" is kept
// (which drops a version suffix such as "/1.0"); it is then trimmed and
// lower-cased. Pieces that end up empty are skipped and duplicates are
// removed, so each token appears once, in order of first appearance.
func splitUserAgentTokens(userAgent string) []string {
	parts := strings.FieldsFunc(userAgent, func(r rune) bool {
		switch r {
		case ' ', ';', '(', ')', ',', '\t':
			return true
		default:
			return false
		}
	})

	items := make([]string, 0, len(parts))
	seen := make(map[string]struct{}, len(parts))
	for _, part := range parts {
		// strings.Cut yields the whole piece when it contains no "/".
		name, _, _ := strings.Cut(part, "/")
		name = strings.ToLower(strings.TrimSpace(name))
		if name == "" {
			continue
		}
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		items = append(items, name)
	}
	return items
}
|
||||
|
||||
func normalizeUserAgents(userAgents []string) []string {
|
||||
items := make([]string, 0, len(userAgents))
|
||||
for _, userAgent := range userAgents {
|
||||
@@ -634,6 +733,9 @@ func extractBotHintName(userAgent string) string {
|
||||
continue
|
||||
}
|
||||
normalized := strings.ToLower(base)
|
||||
if strings.HasPrefix(normalized, "+") || strings.Contains(normalized, "@") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") {
|
||||
return base
|
||||
}
|
||||
|
||||
@@ -176,6 +176,147 @@ func TestInvestigateAddsBotHintFromUserAgent(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePublishedNetworksSupportsCommentedGeofeed(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
prefixes, err := parsePublishedNetworks([]byte(strings.Join([]string{
|
||||
"# Publication date: Thu Mar 12 2026",
|
||||
"31.13.78.0/24,NZ,NZ-AUK,Auckland,",
|
||||
"2a03:2880:f061::/48,NZ,NZ-AUK,Auckland,",
|
||||
"31.13.72.0/24,SE,,Bromma,",
|
||||
"",
|
||||
}, "\n")), "geofeed_csv", "https://example.test/geofeed")
|
||||
if err != nil {
|
||||
t.Fatalf("parse geofeed: %v", err)
|
||||
}
|
||||
if len(prefixes) != 3 {
|
||||
t.Fatalf("expected 3 prefixes, got %d", len(prefixes))
|
||||
}
|
||||
got := []string{prefixes[0].String(), prefixes[1].String(), prefixes[2].String()}
|
||||
want := []string{"31.13.78.0/24", "2a03:2880:f061::/48", "31.13.72.0/24"}
|
||||
if strings.Join(got, ",") != strings.Join(want, ",") {
|
||||
t.Fatalf("unexpected geofeed prefixes: got %v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvestigateRecognizesOpenAIBotViaEmbeddedUserAgentToken(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/gptbot.json" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
svc := newService(
|
||||
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||
server.Client(),
|
||||
&fakeResolver{},
|
||||
log.New(testWriter{t}, "", 0),
|
||||
[]botProvider{{
|
||||
ID: "openai_gptbot_official",
|
||||
Name: "GPTBot",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: time.Hour,
|
||||
IPRangeURLs: []string{server.URL + "/gptbot.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"gptbot",
|
||||
},
|
||||
}},
|
||||
map[string]string{},
|
||||
)
|
||||
|
||||
investigation, err := svc.Investigate(context.Background(), "203.0.113.10", []string{"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"})
|
||||
if err != nil {
|
||||
t.Fatalf("investigate ip: %v", err)
|
||||
}
|
||||
if investigation.Bot == nil || investigation.Bot.Name != "GPTBot" || !investigation.Bot.Verified {
|
||||
t.Fatalf("expected verified GPTBot match, got %+v", investigation.Bot)
|
||||
}
|
||||
if investigation.Bot.Method != "user_agent+published_ranges" {
|
||||
t.Fatalf("expected combined method, got %+v", investigation.Bot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvestigateRecognizesPerplexityBotViaEmbeddedUserAgentToken(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/perplexitybot.json" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"198.51.100.0/24"}]}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
svc := newService(
|
||||
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||
server.Client(),
|
||||
&fakeResolver{},
|
||||
log.New(testWriter{t}, "", 0),
|
||||
[]botProvider{{
|
||||
ID: "perplexitybot_official",
|
||||
Name: "PerplexityBot",
|
||||
Icon: "🤖",
|
||||
SourceFormat: "json_prefixes",
|
||||
CacheTTL: time.Hour,
|
||||
IPRangeURLs: []string{server.URL + "/perplexitybot.json"},
|
||||
UserAgentPrefixes: []string{
|
||||
"perplexitybot",
|
||||
},
|
||||
}},
|
||||
map[string]string{},
|
||||
)
|
||||
|
||||
investigation, err := svc.Investigate(context.Background(), "198.51.100.42", []string{"Mozilla/5.0 (compatible; PerplexityBot/1.0; +https://www.perplexity.ai/perplexitybot)"})
|
||||
if err != nil {
|
||||
t.Fatalf("investigate ip: %v", err)
|
||||
}
|
||||
if investigation.Bot == nil || investigation.Bot.Name != "PerplexityBot" || !investigation.Bot.Verified {
|
||||
t.Fatalf("expected verified PerplexityBot match, got %+v", investigation.Bot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvestigateRecognizesYandexViaReverseDNS(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
resolver := &fakeResolver{
|
||||
reverse: map[string][]string{"203.0.113.55": {"spider-55.search.yandex.ru."}},
|
||||
forward: map[string][]net.IPAddr{"spider-55.search.yandex.ru": {{IP: net.ParseIP("203.0.113.55")}}},
|
||||
}
|
||||
|
||||
svc := newService(
|
||||
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||
http.DefaultClient,
|
||||
resolver,
|
||||
log.New(testWriter{t}, "", 0),
|
||||
[]botProvider{{
|
||||
ID: "yandex_official",
|
||||
Name: "YandexBot",
|
||||
Icon: "🤖",
|
||||
CacheTTL: time.Hour,
|
||||
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
|
||||
}},
|
||||
map[string]string{},
|
||||
)
|
||||
|
||||
investigation, err := svc.Investigate(context.Background(), "203.0.113.55", []string{"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"})
|
||||
if err != nil {
|
||||
t.Fatalf("investigate ip: %v", err)
|
||||
}
|
||||
if investigation.Bot == nil || investigation.Bot.Name != "YandexBot" || !investigation.Bot.Verified {
|
||||
t.Fatalf("expected verified YandexBot match, got %+v", investigation.Bot)
|
||||
}
|
||||
if investigation.Bot.Method != "reverse_dns+fcrdns" {
|
||||
t.Fatalf("expected reverse DNS verification, got %+v", investigation.Bot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPublishedNetworksAreCached(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user