Added traffic filtering against bots
This commit is contained in:
@@ -34,6 +34,7 @@ INSTALLED_APPS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
MIDDLEWARE = [
|
MIDDLEWARE = [
|
||||||
|
"kakigoori.traffic_filtering.TrafficFiltering",
|
||||||
"django.middleware.security.SecurityMiddleware",
|
"django.middleware.security.SecurityMiddleware",
|
||||||
"django.contrib.sessions.middleware.SessionMiddleware",
|
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||||
"django.middleware.common.CommonMiddleware",
|
"django.middleware.common.CommonMiddleware",
|
||||||
|
72
kakigoori/traffic_filtering.py
Normal file
72
kakigoori/traffic_filtering.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.http import HttpRequest, HttpResponseForbidden
|
||||||
|
|
||||||
|
|
||||||
|
class TrafficRuleAction(Enum):
    """Outcome of evaluating a :class:`TrafficRule` against a request."""

    # Reject the request outright (middleware returns 403).
    DENY = "DENY"
    # Accept the request and stop evaluating any further rules.
    ALLOW = "ALLOW"
    # Rule did not match; evaluation continues with the next rule.
    NO_ACTION = "NO_ACTION"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TrafficRule:
    """A single User-Agent filtering rule.

    Attributes:
        name: Human-readable rule identifier (used for diagnostics only).
        user_agent_regex: Pattern searched (not full-matched) against the
            request's ``User-Agent`` header.
        action: Action to return when the pattern matches.
    """

    name: str
    user_agent_regex: re.Pattern
    action: TrafficRuleAction

    def test_rule(self, request: HttpRequest) -> TrafficRuleAction:
        """Evaluate this rule against *request*.

        Returns:
            ``DENY`` when the request carries no (or an empty) User-Agent
            header, this rule's configured action when the regex matches,
            and ``NO_ACTION`` otherwise.
        """
        # ``or None`` deliberately treats an empty User-Agent string the
        # same as a missing header: both are denied below.
        user_agent = request.META.get("HTTP_USER_AGENT") or None
        if user_agent is None:
            return TrafficRuleAction.DENY

        if self.user_agent_regex.search(user_agent) is not None:
            return self.action

        return TrafficRuleAction.NO_ACTION
|
||||||
|
|
||||||
|
|
||||||
|
class TrafficFiltering:
|
||||||
|
traffic_rules = []
|
||||||
|
|
||||||
|
def __init__(self, get_response):
|
||||||
|
self.get_response = get_response
|
||||||
|
|
||||||
|
with open(
|
||||||
|
os.path.join(Path(__file__).resolve().parent, "traffic_rules.json")
|
||||||
|
) as f:
|
||||||
|
traffic_rules_json = json.load(f)
|
||||||
|
|
||||||
|
for rule in traffic_rules_json["rules"]:
|
||||||
|
# noinspection PyTypeChecker
|
||||||
|
self.traffic_rules.append(
|
||||||
|
TrafficRule(
|
||||||
|
rule["name"],
|
||||||
|
re.compile(rule["user_agent_regex"]),
|
||||||
|
TrafficRuleAction[rule["action"]],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def __call__(self, request: HttpRequest):
|
||||||
|
for traffic_rule in self.traffic_rules:
|
||||||
|
print(f"Checking for {traffic_rule.name}")
|
||||||
|
action = traffic_rule.test_rule(request)
|
||||||
|
print(action)
|
||||||
|
match action:
|
||||||
|
case TrafficRuleAction.DENY:
|
||||||
|
return HttpResponseForbidden()
|
||||||
|
case TrafficRuleAction.ALLOW:
|
||||||
|
break
|
||||||
|
case TrafficRuleAction.NO_ACTION:
|
||||||
|
continue
|
||||||
|
|
||||||
|
response = self.get_response(request)
|
||||||
|
return response
|
124
kakigoori/traffic_rules.json
Normal file
124
kakigoori/traffic_rules.json
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
{
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"name": "Amazonbot",
|
||||||
|
"user_agent_regex": "Amazonbot",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "googlebot",
|
||||||
|
"user_agent_regex": "\\+http://www\\.google\\.com/bot\\.html",
|
||||||
|
"action": "ALLOW"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "kagi",
|
||||||
|
"user_agent_regex": "\\+https://kagi\\.com/bot",
|
||||||
|
"action": "ALLOW"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "marginalia",
|
||||||
|
"user_agent_regex": "search\\.marginalia\\.nu",
|
||||||
|
"action": "ALLOW"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mojeekbot",
|
||||||
|
"user_agent_regex": "http\\://www\\.mojeek\\.com/bot\\.html",
|
||||||
|
"action": "ALLOW"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "us-artificial-intelligence-scraper",
|
||||||
|
"user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "lightpanda",
|
||||||
|
"user_agent_regex": "^Lightpanda/.*$",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "headless-chrome",
|
||||||
|
"user_agent_regex": "HeadlessChrome",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "headless-chromium",
|
||||||
|
"user_agent_regex": "HeadlessChromium",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "imagesift",
|
||||||
|
"user_agent_regex": "\\+imagesift\\.com",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "imagesift",
|
||||||
|
"user_agent_regex": "\\+https\\://opensiteexplorer\\.org/dotbot",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SemrushBot",
|
||||||
|
"user_agent_regex": "\\+http\\://www\\.semrush\\.com/bot\\.html",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Facebook",
|
||||||
|
"user_agent_regex": "\\+https?://(?:www|developers).facebook.com",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Bytedance",
|
||||||
|
"user_agent_regex": "Bytespider",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "MJ12Bot",
|
||||||
|
"user_agent_regex": "http://mj12bot\\.com/",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Dataprovider.com",
|
||||||
|
"user_agent_regex": "Dataprovider\\.com",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Dataprovider.com",
|
||||||
|
"user_agent_regex": "Dataprovider\\.com",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "BitSightBot",
|
||||||
|
"user_agent_regex": "BitSightBot",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "babbar.tech",
|
||||||
|
"user_agent_regex": "\\+http\\://babbar\\.tech/crawler",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "censys",
|
||||||
|
"user_agent_regex": "\\+https://about\\.censys\\.io/",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "censys",
|
||||||
|
"user_agent_regex": "\\+https://about\\.censys\\.io/",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Baidu",
|
||||||
|
"user_agent_regex": "Baiduspider",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Expanse",
|
||||||
|
"user_agent_regex": "scaninfo@paloaltonetworks\\.com",
|
||||||
|
"action": "DENY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "InternetMeasurement",
|
||||||
|
"user_agent_regex": "InternetMeasurement",
|
||||||
|
"action": "DENY"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Reference in New Issue
Block a user