Optimize code and add examples

2026-04-17 12:10:50 +02:00
parent e8b4db81ea
commit 0ca880b812
3 changed files with 158 additions and 243 deletions
Executable
+18
@@ -0,0 +1,18 @@
# Basics
GTS_SERVER_URL=https://domain.ltd
GTS_ACCESS_TOKEN=YOUR_ACCESS_TOKEN
# Performance Tuning
MAX_POSTS_PER_RUN=25 # Posts per feed per run
DELAY_BETWEEN_REQUESTS=2 # Seconds between API calls
FETCH_INTERVAL=30m # Interval between runs
REQUEST_TIMEOUT=30 # Timeout for requests to external servers
LOG_LEVEL=INFO # DEBUG for troubleshooting
# Bot Identity
#USER_AGENT=GTS-Federation-Bot/1.0 (Owner: @user@domain.ltd) # Optional
# File Paths (usually don't need to change)
RSS_URLS_FILE=/app/rss_feeds.txt
DATABASE_PATH=/app/data/processed_urls.json
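FETCH_INTERVAL takes a number plus a unit suffix (s/m/h). As a rough sketch of how the script's parse_interval() interprets it — unknown suffixes count as minutes, unparseable values fall back to 30 minutes; the helper name interval_seconds is hypothetical:

def interval_seconds(value: str) -> int:
    # Mirrors parse_interval() in the script below: '45s' -> 45, '30m' -> 1800, '2h' -> 7200
    try:
        return int(value[:-1]) * {'s': 1, 'm': 60, 'h': 3600}.get(value[-1].lower(), 60)
    except (ValueError, IndexError):
        return 1800  # fallback: 30 minutes

assert interval_seconds("30m") == 1800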
+41
@@ -0,0 +1,41 @@
# RSS Feeds
# Tech & Homelab
https://fosstodon.org/tags/homelab.rss?limit=25
https://fosstodon.org/tags/docker.rss?limit=25
https://fosstodon.org/tags/matrix.rss?limit=25
https://fosstodon.org/tags/linux.rss?limit=25
https://fosstodon.org/tags/foss.rss?limit=25
https://fosstodon.org/tags/opensource.rss?limit=25
https://mastodon.social/tags/opensource.rss?limit=25
https://mastodon.social/tags/selfhosting.rss?limit=25
https://mastodon.social/tags/technology.rss?limit=25
https://social.tchncs.de/tags/linux.rss?limit=25
https://social.tchncs.de/tags/synology.rss?limit=25
# News & Politics
https://mastodon.online/tags/nachrichten.rss?limit=25
https://mastodon.social/tags/nachrichten.rss?limit=25
https://norden.social/tags/nachrichten.rss?limit=25
https://norden.social/tags/politik.rss?limit=25
https://berlin.social/tags/politik.rss?limit=25
https://social.bund.de/tags/digitalisierung.rss?limit=25
https://ard.social/tags/tagesschau.rss?limit=25
# Privacy & Security
https://infosec.exchange/tags/security.rss?limit=25
https://infosec.exchange/tags/privacy.rss?limit=25
https://infosec.exchange/tags/cybersecurity.rss?limit=25
https://infosec.exchange/tags/infosec.rss?limit=25
https://infosec.exchange/tags/hacking.rss?limit=25
https://norden.social/tags/datenschutz.rss?limit=25
# Sport
https://mastodon.social/tags/cycling.rss?limit=25
# Special Topics & Community
https://chaos.social/tags/ccc.rss?limit=25
https://chaos.social/tags/republica.rss?limit=25
https://mastodon.social/tags/fediverse.rss?limit=25
https://hachyderm.io/tags/sysadmin.rss?limit=25
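The script filters this file with a single list comprehension: blank lines and full-line comments are skipped, and anything after an inline '#' is cut off. A minimal sketch of that filter (the path matches the RSS_URLS_FILE default; the variable name feeds is illustrative):

with open("/app/rss_feeds.txt") as f:
    feeds = [
        line.split('#')[0].strip()  # cut off inline comments
        for line in f
        if line.strip() and not line.strip().startswith('#')  # skip blanks and comment lines
    ]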
Regular → Executable
+99 -243
#!/usr/bin/env python3
import os
import sys
import time
import json
import logging
import requests
import feedparser
from datetime import timedelta

class GTSHolMirDas:
    def __init__(self):
        self.config = {
            "server_url": os.getenv("GTS_SERVER_URL", "").rstrip('/'),
            "access_token": os.getenv("GTS_ACCESS_TOKEN", ""),
            "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")),
            "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")),
            "fetch_interval": os.getenv("FETCH_INTERVAL", "30m"),
            "log_level": os.getenv("LOG_LEVEL", "INFO"),
            "rss_urls_file": os.getenv("RSS_URLS_FILE", "/app/rss_feeds.txt"),
            "user_agent": os.getenv("USER_AGENT", "GTS-Federation-Bot/1.0 (+https://social.ztfr.eu)")
        }
        logging.basicConfig(
            level=getattr(logging, self.config["log_level"]),
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        self.db_path = os.getenv("DATABASE_PATH", "/app/data/processed_urls.json")
        self.processed_urls, self.previous_instances = self.load_state()
        # One shared session for all GTS API calls; the auth header is set once
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.config['access_token']}",
            "User-Agent": self.config['user_agent']
        })

    def parse_interval(self, interval_str):
        """Convert '45s', '30m' or '2h' into seconds; default to 30 minutes."""
        unit = interval_str[-1].lower()
        try:
            val = int(interval_str[:-1])
            # Unknown suffixes are treated as minutes
            return val * {'s': 1, 'm': 60, 'h': 3600}.get(unit, 60)
        except (ValueError, IndexError):
            return 1800

    def load_state(self):
        """Load processed URLs and the previous instance count from disk."""
        if os.path.exists(self.db_path):
            try:
                with open(self.db_path, 'r') as f:
                    data = json.load(f)
                return set(data.get('processed_urls', [])), data.get('previous_instances', 0)
            except Exception as e:
                self.logger.warning(f"Could not load state database: {e}")
        # First run or unreadable database: start with empty state
        return set(), 0

    def save_state(self, current_instances):
        """Persist processed URLs (capped at 5000 entries) and the instance count."""
        try:
            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
            url_list = list(self.processed_urls)[-5000:]
            with open(self.db_path, 'w') as f:
                json.dump({'processed_urls': url_list, 'previous_instances': current_instances}, f, indent=2)
        except Exception as e:
            self.logger.error(f"Save error: {e}")

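    # Illustrative shape of the state file written above (values are examples only):
    # {
    #   "processed_urls": ["https://fosstodon.org/@someuser/1234567890"],
    #   "previous_instances": 4321
    # }
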
    def process_feeds(self):
        if not os.path.exists(self.config["rss_urls_file"]):
            self.logger.error("RSS_URLS_FILE is missing!")
            return
        with open(self.config["rss_urls_file"], 'r') as f:
            # Skip blank lines and comment lines; strip inline '#' comments
            rss_urls = [l.split('#')[0].strip() for l in f if l.strip() and not l.strip().startswith('#')]
        total_new = 0
        start_time = time.time()
        for i, rss_url in enumerate(rss_urls, 1):
            self.logger.info(f"[{i}/{len(rss_urls)}] 📡 {rss_url}")
            try:
                # Plain requests.get: no auth header is sent to remote instances
                resp = requests.get(rss_url, timeout=15, headers={"User-Agent": self.config['user_agent']})
                feed = feedparser.parse(resp.content)
                if not feed.entries:
                    continue
                new_links = [e.link for e in feed.entries if hasattr(e, 'link') and e.link not in self.processed_urls]
                if new_links:
                    for url in new_links[:self.config["max_posts_per_run"]]:
                        try:
                            # Timeout raised to 30s to avoid "Read timed out" errors
                            r = self.session.get(
                                f"{self.config['server_url']}/api/v2/search",
                                params={'q': url, 'resolve': 'true', 'limit': 1},
                                timeout=30
                            )
                            if r.status_code == 200:
                                self.processed_urls.add(url)
                                total_new += 1
                            elif r.status_code == 429:
                                self.logger.warning("Rate limit hit! Waiting 10s...")
                                time.sleep(10)
                            time.sleep(self.config["delay_between_requests"])
                        except Exception as e:
                            self.logger.error(f"Error on post {url}: {e}")
                    # OPTIMIZATION: save after each feed that yielded new posts
                    self.save_state(self.previous_instances)
            except Exception as e:
                self.logger.error(f"Error on feed {rss_url}: {e}")
        # Instance statistics at the end of the whole run
        try:
            ri = self.session.get(f"{self.config['server_url']}/api/v1/instance", timeout=10)
            curr = ri.json().get('stats', {}).get('domain_count', 0)
            diff = max(0, curr - self.previous_instances) if self.previous_instances else 0
        except Exception:
            curr, diff = self.previous_instances, 0
        runtime = str(timedelta(seconds=int(time.time() - start_time)))
        print(f"\n✅ Run finished | Time: {runtime} | New posts: {total_new} | Instances: {curr} (+{diff})")
        self.save_state(curr)

    def run_forever(self):
        wait = self.parse_interval(self.config["fetch_interval"])
        self.logger.info(f"GTS-Federator active (interval: {self.config['fetch_interval']})")
        while True:
            self.process_feeds()
            self.logger.info(f"Next run in {self.config['fetch_interval']}...")
            time.sleep(wait)

if __name__ == "__main__":
    bot = GTSHolMirDas()
    if not bot.config["access_token"]:
        sys.exit("Error: GTS_ACCESS_TOKEN is missing!")
    bot.run_forever()
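
If a single pass is preferred over the built-in loop (e.g. under cron or a systemd timer), the __main__ block above could be swapped for something like this sketch, which uses only the methods defined in the class:

# Hypothetical one-shot variant of the __main__ block, e.g. for cron:
bot = GTSHolMirDas()
if not bot.config["access_token"]:
    sys.exit("Error: GTS_ACCESS_TOKEN is missing!")
bot.process_feeds()  # one pass over all feeds, then exit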