diff --git a/kakigoori/settings.py b/kakigoori/settings.py
index dbb3725..1745c06 100644
--- a/kakigoori/settings.py
+++ b/kakigoori/settings.py
@@ -34,6 +34,7 @@ INSTALLED_APPS = [
 ]
 
 MIDDLEWARE = [
+    "kakigoori.traffic_filtering.TrafficFiltering",
     "django.middleware.security.SecurityMiddleware",
     "django.contrib.sessions.middleware.SessionMiddleware",
     "django.middleware.common.CommonMiddleware",
diff --git a/kakigoori/traffic_filtering.py b/kakigoori/traffic_filtering.py
new file mode 100644
index 0000000..2ed704b
--- /dev/null
+++ b/kakigoori/traffic_filtering.py
@@ -0,0 +1,80 @@
+import json
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+from django.http import HttpRequest, HttpResponseForbidden
+
+logger = logging.getLogger(__name__)
+
+TRAFFIC_RULES_PATH = Path(__file__).resolve().parent / "traffic_rules.json"
+
+
+class TrafficRuleAction(Enum):
+    """Outcome of evaluating a traffic rule against a request."""
+
+    DENY = "DENY"
+    ALLOW = "ALLOW"
+    NO_ACTION = "NO_ACTION"
+
+
+@dataclass
+class TrafficRule:
+    """A named User-Agent pattern and the action taken when it matches."""
+
+    name: str
+    user_agent_regex: re.Pattern
+    action: TrafficRuleAction
+
+    def test_rule(self, request: HttpRequest) -> TrafficRuleAction:
+        """Return the action this rule prescribes for ``request``.
+
+        A missing or empty User-Agent header is denied. Otherwise the rule's
+        action is returned when its pattern matches, else ``NO_ACTION``.
+        """
+        user_agent = request.META.get("HTTP_USER_AGENT") or None
+        if user_agent is None:
+            return TrafficRuleAction.DENY
+
+        if self.user_agent_regex.search(user_agent) is not None:
+            logger.debug("Traffic rule %r matched a request", self.name)
+            return self.action
+
+        return TrafficRuleAction.NO_ACTION
+
+
+class TrafficFiltering:
+    """Middleware that allows or denies requests based on User-Agent rules."""
+
+    def __init__(self, get_response):
+        self.get_response = get_response
+
+        with open(TRAFFIC_RULES_PATH, encoding="utf-8") as f:
+            traffic_rules_json = json.load(f)
+
+        # Built per instance: a class-level list would gain a duplicate copy
+        # of every rule each time the middleware is instantiated.
+        # noinspection PyTypeChecker
+        self.traffic_rules = [
+            TrafficRule(
+                rule["name"],
+                re.compile(rule["user_agent_regex"]),
+                TrafficRuleAction[rule["action"]],
+            )
+            for rule in traffic_rules_json["rules"]
+        ]
+
+    def __call__(self, request: HttpRequest):
+        # Rules run in file order; the first DENY or ALLOW decides the request.
+        for traffic_rule in self.traffic_rules:
+            match traffic_rule.test_rule(request):
+                case TrafficRuleAction.DENY:
+                    return HttpResponseForbidden()
+                case TrafficRuleAction.ALLOW:
+                    break
+                case TrafficRuleAction.NO_ACTION:
+                    continue
+
+        return self.get_response(request)
diff --git a/kakigoori/traffic_rules.json b/kakigoori/traffic_rules.json
new file mode 100644
index 0000000..35bb1e3
--- /dev/null
+++ b/kakigoori/traffic_rules.json
@@ -0,0 +1,114 @@
+{
+  "rules": [
+    {
+      "name": "Amazonbot",
+      "user_agent_regex": "Amazonbot",
+      "action": "DENY"
+    },
+    {
+      "name": "googlebot",
+      "user_agent_regex": "\\+http://www\\.google\\.com/bot\\.html",
+      "action": "ALLOW"
+    },
+    {
+      "name": "kagi",
+      "user_agent_regex": "\\+https://kagi\\.com/bot",
+      "action": "ALLOW"
+    },
+    {
+      "name": "marginalia",
+      "user_agent_regex": "search\\.marginalia\\.nu",
+      "action": "ALLOW"
+    },
+    {
+      "name": "mojeekbot",
+      "user_agent_regex": "http\\://www\\.mojeek\\.com/bot\\.html",
+      "action": "ALLOW"
+    },
+    {
+      "name": "us-artificial-intelligence-scraper",
+      "user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper",
+      "action": "DENY"
+    },
+    {
+      "name": "lightpanda",
+      "user_agent_regex": "^Lightpanda/.*$",
+      "action": "DENY"
+    },
+    {
+      "name": "headless-chrome",
+      "user_agent_regex": "HeadlessChrome",
+      "action": "DENY"
+    },
+    {
+      "name": "headless-chromium",
+      "user_agent_regex": "HeadlessChromium",
+      "action": "DENY"
+    },
+    {
+      "name": "imagesift",
+      "user_agent_regex": "\\+imagesift\\.com",
+      "action": "DENY"
+    },
+    {
+      "name": "dotbot",
+      "user_agent_regex": "\\+https\\://opensiteexplorer\\.org/dotbot",
+      "action": "DENY"
+    },
+    {
+      "name": "SemrushBot",
+      "user_agent_regex": "\\+http\\://www\\.semrush\\.com/bot\\.html",
+      "action": "DENY"
+    },
+    {
+      "name": "Facebook",
+      "user_agent_regex": "\\+https?://(?:www|developers)\\.facebook\\.com",
+      "action": "DENY"
+    },
+    {
+      "name": "Bytedance",
+      "user_agent_regex": "Bytespider",
+      "action": "DENY"
+    },
+    {
+      "name": "MJ12Bot",
+      "user_agent_regex": "http://mj12bot\\.com/",
+      "action": "DENY"
+    },
+    {
+      "name": "Dataprovider.com",
+      "user_agent_regex": "Dataprovider\\.com",
+      "action": "DENY"
+    },
+    {
+      "name": "BitSightBot",
+      "user_agent_regex": "BitSightBot",
+      "action": "DENY"
+    },
+    {
+      "name": "babbar.tech",
+      "user_agent_regex": "\\+http\\://babbar\\.tech/crawler",
+      "action": "DENY"
+    },
+    {
+      "name": "censys",
+      "user_agent_regex": "\\+https://about\\.censys\\.io/",
+      "action": "DENY"
+    },
+    {
+      "name": "Baidu",
+      "user_agent_regex": "Baiduspider",
+      "action": "DENY"
+    },
+    {
+      "name": "Expanse",
+      "user_agent_regex": "scaninfo@paloaltonetworks\\.com",
+      "action": "DENY"
+    },
+    {
+      "name": "InternetMeasurement",
+      "user_agent_regex": "InternetMeasurement",
+      "action": "DENY"
+    }
+  ]
+}
\ No newline at end of file