diff --git a/kakigoori/settings.py b/kakigoori/settings.py
index 1745c06..dbb3725 100644
--- a/kakigoori/settings.py
+++ b/kakigoori/settings.py
@@ -34,7 +34,6 @@ INSTALLED_APPS = [
 ]
 
 MIDDLEWARE = [
-    "kakigoori.traffic_filtering.TrafficFiltering",
     "django.middleware.security.SecurityMiddleware",
     "django.contrib.sessions.middleware.SessionMiddleware",
     "django.middleware.common.CommonMiddleware",
diff --git a/kakigoori/traffic_filtering.py b/kakigoori/traffic_filtering.py
deleted file mode 100644
index e7ff85e..0000000
--- a/kakigoori/traffic_filtering.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import json
-import os
-import re
-from dataclasses import dataclass
-from enum import Enum
-from pathlib import Path
-
-from django.http import HttpRequest, HttpResponseForbidden
-
-
-class TrafficRuleAction(Enum):
-    DENY = "DENY"
-    ALLOW = "ALLOW"
-    NO_ACTION = "NO_ACTION"
-
-
-@dataclass
-class TrafficRule:
-    name: str
-    user_agent_regex: re.Pattern
-    action: TrafficRuleAction
-
-    def test_rule(self, request: HttpRequest):
-        user_agent = request.META.get("HTTP_USER_AGENT") or None
-        if not user_agent:
-            return TrafficRuleAction.DENY
-
-        print(user_agent)
-
-        if self.user_agent_regex.search(user_agent):
-            return self.action
-
-        return TrafficRuleAction.NO_ACTION
-
-
-class TrafficFiltering:
-    traffic_rules = []
-
-    def __init__(self, get_response):
-        self.get_response = get_response
-
-        with open(
-            os.path.join(Path(__file__).resolve().parent, "traffic_rules.json")
-        ) as f:
-            traffic_rules_json = json.load(f)
-
-        for rule in traffic_rules_json["rules"]:
-            # noinspection PyTypeChecker
-            self.traffic_rules.append(
-                TrafficRule(
-                    rule["name"],
-                    re.compile(rule["user_agent_regex"]),
-                    TrafficRuleAction[rule["action"]],
-                )
-            )
-
-    def __call__(self, request: HttpRequest):
-        for traffic_rule in self.traffic_rules:
-            print(f"Checking for {traffic_rule.name}")
-            action = traffic_rule.test_rule(request)
-            print(action)
-            match action:
-                case TrafficRuleAction.DENY:
-                    return HttpResponseForbidden()
-                case TrafficRuleAction.ALLOW:
-                    break
-                case TrafficRuleAction.NO_ACTION:
-                    continue
-
-        response = self.get_response(request)
-        return response
diff --git a/kakigoori/traffic_rules.json b/kakigoori/traffic_rules.json
deleted file mode 100644
index c25aeba..0000000
--- a/kakigoori/traffic_rules.json
+++ /dev/null
@@ -1,129 +0,0 @@
-{
-  "rules": [
-    {
-      "name": "Amazonbot",
-      "user_agent_regex": "Amazonbot",
-      "action": "DENY"
-    },
-    {
-      "name": "googlebot",
-      "user_agent_regex": "\\+http://www\\.google\\.com/bot\\.html",
-      "action": "ALLOW"
-    },
-    {
-      "name": "kagi",
-      "user_agent_regex": "\\+https://kagi\\.com/bot",
-      "action": "ALLOW"
-    },
-    {
-      "name": "marginalia",
-      "user_agent_regex": "search\\.marginalia\\.nu",
-      "action": "ALLOW"
-    },
-    {
-      "name": "mojeekbot",
-      "user_agent_regex": "http\\://www\\.mojeek\\.com/bot\\.html",
-      "action": "ALLOW"
-    },
-    {
-      "name": "us-artificial-intelligence-scraper",
-      "user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper",
-      "action": "DENY"
-    },
-    {
-      "name": "lightpanda",
-      "user_agent_regex": "^Lightpanda/.*$",
-      "action": "DENY"
-    },
-    {
-      "name": "headless-chrome",
-      "user_agent_regex": "HeadlessChrome",
-      "action": "DENY"
-    },
-    {
-      "name": "headless-chromium",
-      "user_agent_regex": "HeadlessChromium",
-      "action": "DENY"
-    },
-    {
-      "name": "imagesift",
-      "user_agent_regex": "\\+imagesift\\.com",
-      "action": "DENY"
-    },
-    {
-      "name": "imagesift",
-      "user_agent_regex": "\\+https\\://opensiteexplorer\\.org/dotbot",
-      "action": "DENY"
-    },
-    {
-      "name": "SemrushBot",
-      "user_agent_regex": "\\+http\\://www\\.semrush\\.com/bot\\.html",
-      "action": "DENY"
-    },
-    {
-      "name": "Facebook",
-      "user_agent_regex": "\\+https?://(?:www|developers).facebook.com",
-      "action": "DENY"
-    },
-    {
-      "name": "Bytedance",
-      "user_agent_regex": "Bytespider",
-      "action": "DENY"
-    },
-    {
-      "name": "MJ12Bot",
-      "user_agent_regex": "http://mj12bot\\.com/",
-      "action": "DENY"
-    },
-    {
-      "name": "Dataprovider.com",
-      "user_agent_regex": "Dataprovider\\.com",
-      "action": "DENY"
-    },
-    {
-      "name": "Dataprovider.com",
-      "user_agent_regex": "Dataprovider\\.com",
-      "action": "DENY"
-    },
-    {
-      "name": "BitSightBot",
-      "user_agent_regex": "BitSightBot",
-      "action": "DENY"
-    },
-    {
-      "name": "babbar.tech",
-      "user_agent_regex": "\\+http\\://babbar\\.tech/crawler",
-      "action": "DENY"
-    },
-    {
-      "name": "censys",
-      "user_agent_regex": "\\+https://about\\.censys\\.io/",
-      "action": "DENY"
-    },
-    {
-      "name": "censys",
-      "user_agent_regex": "\\+https://about\\.censys\\.io/",
-      "action": "DENY"
-    },
-    {
-      "name": "Baidu",
-      "user_agent_regex": "Baiduspider",
-      "action": "DENY"
-    },
-    {
-      "name": "Expanse",
-      "user_agent_regex": "scaninfo@paloaltonetworks\\.com",
-      "action": "DENY"
-    },
-    {
-      "name": "InternetMeasurement",
-      "user_agent_regex": "InternetMeasurement",
-      "action": "DENY"
-    },
-    {
-      "name": "OtherAI (ai.robots.txt)",
-      "user_agent_regex": "(AI2Bot|Ai2Bot\\-Dolma|Amazonbot|anthropic\\-ai|Applebot|Applebot\\-Extended|Brightbot\\ 1\\.0|Bytespider|CCBot|ChatGPT\\-User|Claude\\-Web|ClaudeBot|cohere\\-ai|cohere\\-training\\-data\\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\\-Extended|GoogleOther|GoogleOther\\-Image|GoogleOther\\-Video|GPTBot|iaskspider/2\\.0|ICC\\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\\ Bot|Meta\\-ExternalAgent|Meta\\-ExternalFetcher|OAI\\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\\-OCOB|SemrushBot\\-SWA|Sidetrade\\ indexer\\ bot|Timpibot|VelenPublicWebCrawler|Webzio\\-Extended|YouBot)",
-      "action": "DENY"
-    }
-  ]
-}
\ No newline at end of file