You've already forked caddy-opnsense-blocker
Harden verified bot detection
This commit is contained in:
@@ -62,7 +62,8 @@
|
|||||||
- The background worker only fills in missing investigations; it does not continuously re-check cached intelligence.
|
- The background worker only fills in missing investigations; it does not continuously re-check cached intelligence.
|
||||||
- Opening an IP details page reuses the cached investigation.
|
- Opening an IP details page reuses the cached investigation.
|
||||||
- `Refresh investigation` is the explicit action that forces a new lookup.
|
- `Refresh investigation` is the explicit action that forces a new lookup.
|
||||||
- Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, and DuckDuckGo.
|
- Verified bot detection currently uses built-in provider logic for Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex.
|
||||||
|
- When a provider publishes official IP ranges for its crawlers, the daemon prefers those ranges and can combine them with User-Agent verification when the provider documents distinct bot user agents.
|
||||||
- When an address is not identified as a verified bot, the daemon can collect reverse DNS, forward-confirmed reverse DNS, RDAP registration details, and Spamhaus DNSBL status.
|
- When an address is not identified as a verified bot, the daemon can collect reverse DNS, forward-confirmed reverse DNS, RDAP registration details, and Spamhaus DNSBL status.
|
||||||
|
|
||||||
## Caddy log requirements
|
## Caddy log requirements
|
||||||
|
|||||||
@@ -82,6 +82,10 @@ Current releases can collect:
|
|||||||
- RDAP registration details such as network name, organization, country, prefix, and abuse contact
|
- RDAP registration details such as network name, organization, country, prefix, and abuse contact
|
||||||
- Spamhaus listed or not listed status
|
- Spamhaus listed or not listed status
|
||||||
|
|
||||||
|
Built-in verified bot providers currently cover Google, Bing, Apple, Meta, DuckDuckGo, OpenAI, Perplexity, and Yandex.
|
||||||
|
|
||||||
|
When a provider publishes official crawler ranges, the daemon uses those published ranges as the source of truth and can also require a matching User-Agent token for provider families that expose several distinct crawlers.
|
||||||
|
|
||||||
## `opnsense`
|
## `opnsense`
|
||||||
|
|
||||||
Controls the optional firewall backend.
|
Controls the optional firewall backend.
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
package investigation
|
package investigation
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/csv"
|
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
@@ -293,17 +294,14 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
|
|||||||
}
|
}
|
||||||
return networks, nil
|
return networks, nil
|
||||||
case "geofeed_csv":
|
case "geofeed_csv":
|
||||||
reader := csv.NewReader(strings.NewReader(string(payload)))
|
scanner := bufio.NewScanner(bytes.NewReader(payload))
|
||||||
rows, err := reader.ReadAll()
|
networks := make([]netip.Prefix, 0, 64)
|
||||||
if err != nil {
|
for scanner.Scan() {
|
||||||
return nil, fmt.Errorf("decode geofeed payload from %s: %w", sourceURL, err)
|
line := strings.TrimSpace(scanner.Text())
|
||||||
}
|
if line == "" || strings.HasPrefix(line, "#") {
|
||||||
networks := make([]netip.Prefix, 0, len(rows))
|
|
||||||
for _, row := range rows {
|
|
||||||
if len(row) == 0 {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
candidate := strings.TrimSpace(row[0])
|
candidate := strings.TrimSpace(strings.SplitN(line, ",", 2)[0])
|
||||||
if candidate == "" || strings.HasPrefix(candidate, "#") {
|
if candidate == "" || strings.HasPrefix(candidate, "#") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -313,6 +311,9 @@ func parsePublishedNetworks(payload []byte, sourceFormat string, sourceURL strin
|
|||||||
}
|
}
|
||||||
networks = append(networks, prefix.Masked())
|
networks = append(networks, prefix.Masked())
|
||||||
}
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return nil, fmt.Errorf("scan geofeed payload from %s: %w", sourceURL, err)
|
||||||
|
}
|
||||||
return networks, nil
|
return networks, nil
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL)
|
return nil, fmt.Errorf("unsupported source format %q for %s", sourceFormat, sourceURL)
|
||||||
@@ -533,6 +534,7 @@ func defaultBotProviders() []botProvider {
|
|||||||
Icon: "🤖",
|
Icon: "🤖",
|
||||||
SourceFormat: "json_prefixes",
|
SourceFormat: "json_prefixes",
|
||||||
CacheTTL: 24 * time.Hour,
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://www.bing.com/toolbox/bingbot.json"},
|
||||||
ReverseDNSSuffixes: []string{".search.msn.com"},
|
ReverseDNSSuffixes: []string{".search.msn.com"},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -552,11 +554,11 @@ func defaultBotProviders() []botProvider {
|
|||||||
CacheTTL: 24 * time.Hour,
|
CacheTTL: 24 * time.Hour,
|
||||||
IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"},
|
IPRangeURLs: []string{"https://www.facebook.com/peering/geofeed"},
|
||||||
UserAgentPrefixes: []string{
|
UserAgentPrefixes: []string{
|
||||||
"facebookexternalhit/",
|
"facebookexternalhit",
|
||||||
"meta-webindexer/",
|
"meta-webindexer",
|
||||||
"meta-externalads/",
|
"meta-externalads",
|
||||||
"meta-externalagent/",
|
"meta-externalagent",
|
||||||
"meta-externalfetcher/",
|
"meta-externalfetcher",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -567,6 +569,68 @@ func defaultBotProviders() []botProvider {
|
|||||||
CacheTTL: 24 * time.Hour,
|
CacheTTL: 24 * time.Hour,
|
||||||
IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"},
|
IPRangeURLs: []string{"https://duckduckgo.com/duckduckbot.json"},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
ID: "openai_gptbot_official",
|
||||||
|
Name: "GPTBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://openai.com/gptbot.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"gptbot",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "openai_chatgpt_user_official",
|
||||||
|
Name: "ChatGPT-User",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://openai.com/chatgpt-user.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"chatgpt-user",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "openai_oai_searchbot_official",
|
||||||
|
Name: "OAI-SearchBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://openai.com/searchbot.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"oai-searchbot",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "perplexitybot_official",
|
||||||
|
Name: "PerplexityBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://www.perplexity.com/perplexitybot.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"perplexitybot",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "perplexity_user_official",
|
||||||
|
Name: "Perplexity-User",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
IPRangeURLs: []string{"https://www.perplexity.com/perplexity-user.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"perplexity-user",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "yandex_official",
|
||||||
|
Name: "YandexBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
CacheTTL: 24 * time.Hour,
|
||||||
|
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -582,14 +646,49 @@ func ipMatchesPrefixes(ip netip.Addr, prefixes []netip.Prefix) bool {
|
|||||||
// userAgentMatchesPrefixes reports whether at least one entry of userAgents
// identifies one of the bot tokens listed in prefixes.
//
// Every prefix is normalized first: surrounding whitespace and a single
// trailing "/" are removed and the result is lowercased. Entries that end up
// empty are ignored. A User-Agent matches a normalized prefix when either
//
//   - the User-Agent starts with it (compared case-insensitively), or
//   - one of the User-Agent's product tokens, as returned by
//     splitUserAgentTokens, is equal to it.
//
// The second rule is what recognizes tokens embedded in strings such as
// "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)".
// It returns false when userAgents is empty or no usable prefix remains.
func userAgentMatchesPrefixes(userAgents []string, prefixes []string) bool {
	// Normalize the provider tokens once instead of once per User-Agent.
	candidates := make([]string, 0, len(prefixes))
	for _, prefix := range prefixes {
		// Trim whitespace both before and after dropping the trailing slash so
		// that entries such as " gptbot/ " or "gptbot /" reduce to "gptbot".
		candidate := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(prefix), "/"))
		if candidate == "" {
			continue
		}
		candidates = append(candidates, strings.ToLower(candidate))
	}
	if len(candidates) == 0 {
		return false
	}

	for _, agent := range userAgents {
		// Both views of the agent are invariant across candidates, so compute
		// them once per agent. Lowercasing keeps the prefix rule consistent
		// with the (already case-insensitive) token rule.
		lowered := strings.ToLower(agent)
		tokens := splitUserAgentTokens(agent)
		for _, candidate := range candidates {
			if strings.HasPrefix(lowered, candidate) {
				return true
			}
			for _, token := range tokens {
				if token == candidate {
					return true
				}
			}
		}
	}
	return false
}

// splitUserAgentTokens breaks a User-Agent string into its distinct product
// names.
//
// The string is split on spaces, tabs, ';', ',', '(' and ')'. For each piece
// only the part before the first "/" is kept (dropping a version such as
// "/1.0"), trimmed and lowercased. Empty results and duplicates are discarded;
// the returned slice preserves first-occurrence order.
func splitUserAgentTokens(userAgent string) []string {
	parts := strings.FieldsFunc(userAgent, func(r rune) bool {
		switch r {
		case ' ', ';', '(', ')', ',', '\t':
			return true
		default:
			return false
		}
	})

	items := make([]string, 0, len(parts))
	seen := make(map[string]struct{}, len(parts))
	for _, part := range parts {
		name, _, _ := strings.Cut(part, "/")
		normalized := strings.ToLower(strings.TrimSpace(name))
		if normalized == "" {
			continue
		}
		if _, ok := seen[normalized]; ok {
			continue
		}
		seen[normalized] = struct{}{}
		items = append(items, normalized)
	}
	return items
}
|
||||||
|
|
||||||
func normalizeUserAgents(userAgents []string) []string {
|
func normalizeUserAgents(userAgents []string) []string {
|
||||||
items := make([]string, 0, len(userAgents))
|
items := make([]string, 0, len(userAgents))
|
||||||
for _, userAgent := range userAgents {
|
for _, userAgent := range userAgents {
|
||||||
@@ -634,6 +733,9 @@ func extractBotHintName(userAgent string) string {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
normalized := strings.ToLower(base)
|
normalized := strings.ToLower(base)
|
||||||
|
if strings.HasPrefix(normalized, "+") || strings.Contains(normalized, "@") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") {
|
if strings.Contains(normalized, "bot") || strings.Contains(normalized, "crawler") || strings.Contains(normalized, "spider") || strings.Contains(normalized, "slurp") || strings.Contains(normalized, "fetcher") || strings.Contains(normalized, "indexer") || strings.Contains(normalized, "preview") || strings.Contains(normalized, "externalhit") {
|
||||||
return base
|
return base
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -176,6 +176,147 @@ func TestInvestigateAddsBotHintFromUserAgent(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParsePublishedNetworksSupportsCommentedGeofeed(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
prefixes, err := parsePublishedNetworks([]byte(strings.Join([]string{
|
||||||
|
"# Publication date: Thu Mar 12 2026",
|
||||||
|
"31.13.78.0/24,NZ,NZ-AUK,Auckland,",
|
||||||
|
"2a03:2880:f061::/48,NZ,NZ-AUK,Auckland,",
|
||||||
|
"31.13.72.0/24,SE,,Bromma,",
|
||||||
|
"",
|
||||||
|
}, "\n")), "geofeed_csv", "https://example.test/geofeed")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("parse geofeed: %v", err)
|
||||||
|
}
|
||||||
|
if len(prefixes) != 3 {
|
||||||
|
t.Fatalf("expected 3 prefixes, got %d", len(prefixes))
|
||||||
|
}
|
||||||
|
got := []string{prefixes[0].String(), prefixes[1].String(), prefixes[2].String()}
|
||||||
|
want := []string{"31.13.78.0/24", "2a03:2880:f061::/48", "31.13.72.0/24"}
|
||||||
|
if strings.Join(got, ",") != strings.Join(want, ",") {
|
||||||
|
t.Fatalf("unexpected geofeed prefixes: got %v want %v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInvestigateRecognizesOpenAIBotViaEmbeddedUserAgentToken(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.URL.Path != "/gptbot.json" {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`))
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
svc := newService(
|
||||||
|
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||||
|
server.Client(),
|
||||||
|
&fakeResolver{},
|
||||||
|
log.New(testWriter{t}, "", 0),
|
||||||
|
[]botProvider{{
|
||||||
|
ID: "openai_gptbot_official",
|
||||||
|
Name: "GPTBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: time.Hour,
|
||||||
|
IPRangeURLs: []string{server.URL + "/gptbot.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"gptbot",
|
||||||
|
},
|
||||||
|
}},
|
||||||
|
map[string]string{},
|
||||||
|
)
|
||||||
|
|
||||||
|
investigation, err := svc.Investigate(context.Background(), "203.0.113.10", []string{"Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("investigate ip: %v", err)
|
||||||
|
}
|
||||||
|
if investigation.Bot == nil || investigation.Bot.Name != "GPTBot" || !investigation.Bot.Verified {
|
||||||
|
t.Fatalf("expected verified GPTBot match, got %+v", investigation.Bot)
|
||||||
|
}
|
||||||
|
if investigation.Bot.Method != "user_agent+published_ranges" {
|
||||||
|
t.Fatalf("expected combined method, got %+v", investigation.Bot)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInvestigateRecognizesPerplexityBotViaEmbeddedUserAgentToken(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.URL.Path != "/perplexitybot.json" {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"198.51.100.0/24"}]}`))
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
svc := newService(
|
||||||
|
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||||
|
server.Client(),
|
||||||
|
&fakeResolver{},
|
||||||
|
log.New(testWriter{t}, "", 0),
|
||||||
|
[]botProvider{{
|
||||||
|
ID: "perplexitybot_official",
|
||||||
|
Name: "PerplexityBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
SourceFormat: "json_prefixes",
|
||||||
|
CacheTTL: time.Hour,
|
||||||
|
IPRangeURLs: []string{server.URL + "/perplexitybot.json"},
|
||||||
|
UserAgentPrefixes: []string{
|
||||||
|
"perplexitybot",
|
||||||
|
},
|
||||||
|
}},
|
||||||
|
map[string]string{},
|
||||||
|
)
|
||||||
|
|
||||||
|
investigation, err := svc.Investigate(context.Background(), "198.51.100.42", []string{"Mozilla/5.0 (compatible; PerplexityBot/1.0; +https://www.perplexity.ai/perplexitybot)"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("investigate ip: %v", err)
|
||||||
|
}
|
||||||
|
if investigation.Bot == nil || investigation.Bot.Name != "PerplexityBot" || !investigation.Bot.Verified {
|
||||||
|
t.Fatalf("expected verified PerplexityBot match, got %+v", investigation.Bot)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInvestigateRecognizesYandexViaReverseDNS(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
resolver := &fakeResolver{
|
||||||
|
reverse: map[string][]string{"203.0.113.55": {"spider-55.search.yandex.ru."}},
|
||||||
|
forward: map[string][]net.IPAddr{"spider-55.search.yandex.ru": {{IP: net.ParseIP("203.0.113.55")}}},
|
||||||
|
}
|
||||||
|
|
||||||
|
svc := newService(
|
||||||
|
config.InvestigationConfig{Enabled: true, Timeout: config.Duration{Duration: time.Second}, UserAgent: "test-agent", SpamhausEnabled: true},
|
||||||
|
http.DefaultClient,
|
||||||
|
resolver,
|
||||||
|
log.New(testWriter{t}, "", 0),
|
||||||
|
[]botProvider{{
|
||||||
|
ID: "yandex_official",
|
||||||
|
Name: "YandexBot",
|
||||||
|
Icon: "🤖",
|
||||||
|
CacheTTL: time.Hour,
|
||||||
|
ReverseDNSSuffixes: []string{".yandex.ru", ".yandex.net", ".yandex.com"},
|
||||||
|
}},
|
||||||
|
map[string]string{},
|
||||||
|
)
|
||||||
|
|
||||||
|
investigation, err := svc.Investigate(context.Background(), "203.0.113.55", []string{"Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("investigate ip: %v", err)
|
||||||
|
}
|
||||||
|
if investigation.Bot == nil || investigation.Bot.Name != "YandexBot" || !investigation.Bot.Verified {
|
||||||
|
t.Fatalf("expected verified YandexBot match, got %+v", investigation.Bot)
|
||||||
|
}
|
||||||
|
if investigation.Bot.Method != "reverse_dns+fcrdns" {
|
||||||
|
t.Fatalf("expected reverse DNS verification, got %+v", investigation.Bot)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPublishedNetworksAreCached(t *testing.T) {
|
func TestPublishedNetworksAreCached(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user