diff --git a/Examples/.env b/Examples/.env
new file mode 100755
index 0000000..01fd7e7
--- /dev/null
+++ b/Examples/.env
@@ -0,0 +1,18 @@
+# Basics
+GTS_SERVER_URL=https://domain.ltd
+GTS_ACCESS_TOKEN=YOUR_ACCESS_TOKEN
+
+# Performance Tuning
+MAX_POSTS_PER_RUN=25        # Posts per feed per run
+DELAY_BETWEEN_REQUESTS=2    # Seconds between API calls
+FETCH_INTERVAL=30m          # Interval between update runs
+REQUEST_TIMEOUT=30          # Timeout for requests to external servers
+LOG_LEVEL=INFO              # DEBUG for troubleshooting
+
+
+# Bot Identity
+#USER_AGENT=GTS-Federation-Bot/1.0 (Owner: @user@domain.ltd)    # Optional
+
+# File Paths (usually don't need to change)
+RSS_URLS_FILE=/app/rss_feeds.txt
+DATABASE_PATH=/app/data/processed_urls.json
diff --git a/Examples/rss_feeds.txt b/Examples/rss_feeds.txt
new file mode 100755
index 0000000..ca8c6a6
--- /dev/null
+++ b/Examples/rss_feeds.txt
@@ -0,0 +1,41 @@
+# RSS Feeds
+
+# Tech & Homelab
+https://fosstodon.org/tags/homelab.rss?limit=25
+https://fosstodon.org/tags/docker.rss?limit=25
+https://fosstodon.org/tags/matrix.rss?limit=25
+https://fosstodon.org/tags/linux.rss?limit=25
+https://fosstodon.org/tags/foss.rss?limit=25
+https://fosstodon.org/tags/opensource.rss?limit=25
+https://mastodon.social/tags/opensource.rss?limit=25
+https://mastodon.social/tags/selfhosting.rss?limit=25
+https://mastodon.social/tags/technology.rss?limit=25
+https://social.tchncs.de/tags/linux.rss?limit=25
+https://social.tchncs.de/tags/synology.rss?limit=25
+
+# News & Politics
+https://mastodon.online/tags/nachrichten.rss?limit=25
+https://mastodon.social/tags/nachrichten.rss?limit=25
+https://norden.social/tags/nachrichten.rss?limit=25
+https://norden.social/tags/politik.rss?limit=25
+https://berlin.social/tags/politik.rss?limit=25
+https://social.bund.de/tags/digitalisierung.rss?limit=25
+https://ard.social/tags/tagesschau.rss?limit=25
+
+
+# Privacy & Security
+https://infosec.exchange/tags/security.rss?limit=25
+https://infosec.exchange/tags/privacy.rss?limit=25
+https://infosec.exchange/tags/cybersecurity.rss?limit=25
+https://infosec.exchange/tags/infosec.rss?limit=25
+https://infosec.exchange/tags/hacking.rss?limit=25
+https://norden.social/tags/datenschutz.rss?limit=25
+
+# Sport
+https://mastodon.social/tags/cycling.rss?limit=25
+
+# Special Topics & Community
+https://chaos.social/tags/ccc.rss?limit=25
+https://chaos.social/tags/republica.rss?limit=25
+https://mastodon.social/tags/fediverse.rss?limit=25
+https://hachyderm.io/tags/sysadmin.rss?limit=25
\ No newline at end of file
diff --git a/gts_holmirdas.py b/gts_holmirdas.py
old mode 100644
new mode 100755
index 77d1eaf..2d8e1ae
--- a/gts_holmirdas.py
+++ b/gts_holmirdas.py
@@ -1,15 +1,4 @@
 #!/usr/bin/env python3
-"""
-GTS-HolMirDas: RSS-based content discovery for GoToSocial
-
-Inspired by HolMirDas by @aliceif:
-- GitHub: https://github.com/aliceif/HolMirDas
-- Fediverse: @aliceif@mkultra.x27.one
-
-This GoToSocial adaptation extends the original RSS-to-ActivityPub concept
-with Docker deployment, multi-instance processing, and comprehensive monitoring.
-""" - import os import sys import time @@ -18,264 +7,131 @@ import logging import requests import feedparser from datetime import timedelta -from urllib.parse import quote_plus class GTSHolMirDas: def __init__(self): - """Initialize the RSS fetcher with configuration""" self.config = { - "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"), + "server_url": os.getenv("GTS_SERVER_URL", "").rstrip('/'), "access_token": os.getenv("GTS_ACCESS_TOKEN", ""), "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")), "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")), - "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""), - "log_level": os.getenv("LOG_LEVEL", "INFO") + "fetch_interval": os.getenv("FETCH_INTERVAL", "30m"), + "log_level": os.getenv("LOG_LEVEL", "INFO"), + "rss_urls_file": os.getenv("RSS_URLS_FILE", "/app/rss_feeds.txt"), + "user_agent": os.getenv("USER_AGENT", "GTS-Federation-Bot/1.0 (+https://social.ztfr.eu)") } - # Setup logging FIRST logging.basicConfig( level=getattr(logging, self.config["log_level"]), format='%(asctime)s - %(levelname)s - %(message)s' ) self.logger = logging.getLogger(__name__) + self.db_path = os.getenv("DATABASE_PATH", "/app/data/processed_urls.json") + self.processed_urls, self.previous_instances = self.load_state() - # Load RSS URLs from file or environment - rss_urls_file = os.getenv("RSS_URLS_FILE") - if rss_urls_file and os.path.exists(rss_urls_file): - # Load from file + self.session = requests.Session() + self.session.headers.update({ + "Authorization": f"Bearer {self.config['access_token']}", + "User-Agent": self.config['user_agent'] + }) + + def parse_interval(self, interval_str): + unit = interval_str[-1].lower() + try: + val = int(interval_str[:-1]) + return val * {'s': 1, 'm': 60, 'h': 3600}.get(unit, 60) + except: + return 1800 + + def load_state(self): + if os.path.exists(self.db_path): try: - with open(rss_urls_file, 'r') as f: - self.config["rss_urls"] = [ - line.split('#', 1)[0].strip() for line in f - if line.strip() and not line.strip().startswith('#') - ] - self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}") - except Exception as e: - self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}") - self.config["rss_urls"] = [] - else: - # Fallback to environment variable - self.config["rss_urls"] = [ - url.strip() for url in os.getenv("RSS_URLS", "").split(",") - if url.strip() - ] - if self.config["rss_urls"]: - self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment") - - # Load processed URLs from persistent storage - self.processed_urls_file = "/app/data/processed_urls.json" - self.processed_urls = self.load_processed_urls() - - # Statistics tracking - self.previous_instances = getattr(self, 'previous_instances', 0) - - def load_processed_urls(self): - """Load previously processed URLs and instance count from file""" - try: - if os.path.exists(self.processed_urls_file): - with open(self.processed_urls_file, 'r') as f: + with open(self.db_path, 'r') as f: data = json.load(f) - # Load previous instance count for statistics - self.previous_instances = data.get('previous_instances', 0) - return set(data.get('processed_urls', [])) - except Exception as e: - self.logger.warning(f"Could not load processed URLs: {e}") - - return set() + return set(data.get('processed_urls', [])), data.get('previous_instances', 0) + except Exception as e: + self.logger.warning(f"DB konnte nicht geladen werden: {e}") + return set(), 0 - 
-    def save_processed_urls(self, current_instances=None):
-        """Save processed URLs and current instance count to file"""
+    def save_state(self, current_instances):
         try:
-            os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True)
-            data = {
-                'processed_urls': list(self.processed_urls),
-                'last_updated': time.time()
-            }
-            # Save current instance count for next run
-            if current_instances is not None and current_instances != 'unknown':
-                data['previous_instances'] = current_instances
-
-            with open(self.processed_urls_file, 'w') as f:
-                json.dump(data, f, indent=2)
+            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
+            url_list = list(self.processed_urls)[-5000:]
+            with open(self.db_path, 'w') as f:
+                json.dump({'processed_urls': url_list, 'previous_instances': current_instances}, f, indent=2)
         except Exception as e:
-            self.logger.error(f"Could not save processed URLs: {e}")
-
-    def fetch_rss_urls(self, rss_url):
-        """Fetch URLs from RSS feed"""
-        try:
-            self.logger.info(f"Fetching RSS feed: {rss_url}")
-
-            # Parse RSS feed
-            feed = feedparser.parse(rss_url)
-
-            if feed.bozo:
-                self.logger.warning(f"RSS feed may have issues: {rss_url}")
-
-            # Extract URLs from entries
-            urls = []
-            for entry in feed.entries:
-                if hasattr(entry, 'link'):
-                    urls.append(entry.link)
-
-            self.logger.info(f"Found {len(urls)} URLs in RSS feed")
-            return urls
-
-        except Exception as e:
-            self.logger.error(f"Error fetching RSS feed {rss_url}: {e}")
-            return []
-
-    def lookup_post(self, post_url):
-        """Look up a post URL using GTS search API"""
-        try:
-            # Prepare search API call
-            search_url = f"{self.config['server_url']}/api/v2/search"
-            params = {
-                'q': post_url,
-                'type': 'statuses',
-                'resolve': 'true',
-                'limit': 1
-            }
-            headers = {
-                'Authorization': f'Bearer {self.config["access_token"]}',
-                'Content-Type': 'application/json'
-            }
-
-            # Make API call
-            response = requests.get(
-                search_url,
-                params=params,
-                headers=headers,
-                timeout=30
-            )
-
-            if response.status_code == 200:
-                results = response.json()
-                if results.get('statuses') or results.get('accounts'):
-                    self.logger.info(f"Successfully looked up: {post_url}")
-                    return True
-                else:
-                    self.logger.warning(f"No results for: {post_url}")
-                    return False
-            else:
-                self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}")
-                return False
-
-        except requests.exceptions.RequestException as e:
-            self.logger.error(f"Error looking up {post_url}: {e}")
-            return False
+            self.logger.error(f"Save error: {e}")
 
     def process_feeds(self):
-        """Process all configured RSS feeds"""
-        total_processed = 0
-
-        # Record start time for statistics
-        self.start_time = time.time()
-
-        # Ping healthcheck start
-        self.ping_healthcheck("/start")
-
-        try:
-            for rss_url in self.config["rss_urls"]:
-                if not rss_url.strip():
-                    continue
-
-                self.logger.info(f"Processing feed: {rss_url}")
-
-                # Get URLs from RSS
-                urls = self.fetch_rss_urls(rss_url)
-
-                # Filter out already processed URLs
-                new_urls = [url for url in urls if url not in self.processed_urls]
-
-                if not new_urls:
-                    self.logger.info("No new URLs to process")
-                    continue
-
-                # Rate limiting: max posts per run
-                urls_to_process = new_urls[:self.config["max_posts_per_run"]]
-
-                self.logger.info(f"Processing {len(urls_to_process)} new URLs")
-
-                for url in urls_to_process:
-                    if self.lookup_post(url):
-                        self.processed_urls.add(url)
-                        total_processed += 1
-
-                    # Rate limiting: delay between requests
-                    time.sleep(self.config["delay_between_requests"])
-
-            # Calculate runtime
-            end_time = time.time()
-            runtime_seconds = end_time - self.start_time
-            runtime_formatted = str(timedelta(seconds=int(runtime_seconds)))
-
-            # Get current instance count
-            try:
-                instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance",
-                                           headers={'Authorization': f'Bearer {self.config["access_token"]}'},
-                                           timeout=10)
-                if instance_info.status_code == 200:
-                    current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown')
-                else:
-                    current_instances = 'unknown'
-            except Exception as e:
-                self.logger.error(f"Failed to get instance count: {e}")
-                current_instances = 'unknown'
-
-            # Calculate new instances (if we have previous data)
-            new_instances = 'unknown'
-            if self.previous_instances > 0 and current_instances != 'unknown':
-                new_instances = current_instances - self.previous_instances
-
-            # Print comprehensive statistics
-            print(f"\n📊 GTS-HolMirDas Run Statistics:")
-            print(f"   ⏱️ Runtime: {runtime_formatted}")
-            print(f"   📄 Total posts processed: {total_processed}")
-            print(f"   🌐 Current known instances: {current_instances}")
-            if new_instances != 'unknown' and new_instances > 0:
-                print(f"   ➕ New instances discovered: +{new_instances}")
-            elif new_instances == 0:
-                print(f"   ➕ New instances discovered: +0")
-            print(f"   📡 RSS feeds processed: {len(self.config['rss_urls'])}")
-            if runtime_seconds > 60:
-                print(f"   ⚡ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}")
-
-            self.save_processed_urls(current_instances)
-
-            # Ping healthcheck success
-            self.ping_healthcheck("")
-
-        except Exception as e:
-            self.logger.error(f"Error during processing: {e}")
-            # Ping healthcheck failure
-            self.ping_healthcheck("/fail")
-            raise
-
-    def ping_healthcheck(self, endpoint=""):
-        """Ping healthchecks.io for monitoring"""
-        if not self.config.get("healthcheck_url"):
+        if not os.path.exists(self.config["rss_urls_file"]):
+            self.logger.error("RSS_URLS_FILE is missing!")
             return
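+        # One feed URL per line; blank lines and '#' comments are ignored.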
+        with open(self.config["rss_urls_file"], 'r') as f:
+            rss_urls = [l.split('#')[0].strip() for l in f if l.strip() and not l.strip().startswith('#')]
+
+        total_new = 0
+        start_time = time.time()
+
+        for i, rss_url in enumerate(rss_urls, 1):
+            self.logger.info(f"[{i}/{len(rss_urls)}] 📡 {rss_url}")
+            try:
+                resp = requests.get(rss_url, timeout=15, headers={"User-Agent": self.config['user_agent']})
+                feed = feedparser.parse(resp.content)
+
+                if not feed.entries:
+                    continue
+
+                new_links = [e.link for e in feed.entries if hasattr(e, 'link') and e.link not in self.processed_urls]
+
+                if new_links:
+                    for url in new_links[:self.config["max_posts_per_run"]]:
+                        try:
+                            # Timeout set to 30s to avoid "Read timed out" errors
+                            r = self.session.get(
+                                f"{self.config['server_url']}/api/v2/search",
+                                params={'q': url, 'resolve': 'true', 'limit': 1},
+                                timeout=30
+                            )
+                            if r.status_code == 200:
+                                self.processed_urls.add(url)
+                                total_new += 1
+                            elif r.status_code == 429:
+                                self.logger.warning("Rate limit hit! Waiting 10s...")
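+                                # Back off; the URL is not marked processed, so it is retried on a later run.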
+                                time.sleep(10)
+
+                            time.sleep(self.config["delay_between_requests"])
+                        except Exception as e:
+                            self.logger.error(f"Error processing post {url}: {e}")
+
+                    # Optimization: persist state after each feed that yielded new posts
+                    self.save_state(self.previous_instances)
+
+            except Exception as e:
+                self.logger.error(f"Error processing feed {rss_url}: {e}")
+
+        # Instance statistics at the end of the whole run
         try:
-            url = self.config["healthcheck_url"] + endpoint
-            requests.get(url, timeout=10)
-        except Exception as e:
-            self.logger.warning(f"Failed to ping healthcheck: {e}")
+            ri = self.session.get(f"{self.config['server_url']}/api/v1/instance", timeout=10)
+            curr = ri.json().get('stats', {}).get('domain_count', 0)
+            diff = max(0, curr - self.previous_instances) if self.previous_instances else 0
+        except Exception:
+            curr, diff = self.previous_instances, 0
 
-def main():
-    """Main entry point"""
-    try:
-        fetcher = GTSHolMirDas()
+        runtime = str(timedelta(seconds=int(time.time() - start_time)))
+        print(f"\n✅ Run finished | Time: {runtime} | New posts: {total_new} | Instances: {curr} (+{diff})")
+        self.save_state(curr)
 
-        # Validate required config
-        if not fetcher.config["access_token"]:
-            raise ValueError("GTS_ACCESS_TOKEN environment variable is required")
-
-        fetcher.process_feeds()
-
-    except Exception as e:
-        logging.error(f"Fatal error: {e}")
-        raise
+    def run_forever(self):
+        wait = self.parse_interval(self.config["fetch_interval"])
+        self.logger.info(f"GTS-Federator active (interval: {self.config['fetch_interval']})")
+        while True:
+            self.process_feeds()
+            self.logger.info(f"Next run in {self.config['fetch_interval']}...")
+            time.sleep(wait)
 
 if __name__ == "__main__":
-    main()
+    bot = GTSHolMirDas()
+    if not bot.config["access_token"]:
+        sys.exit("Error: GTS_ACCESS_TOKEN is missing!")
+    bot.run_forever()
\ No newline at end of file
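The mechanism this patch leans on is GoToSocial's search endpoint: an authenticated GET with resolve=true asks the instance to dereference a remote status URL and ingest it locally. A minimal standalone sketch of that call, with server URL, token, and status URL as placeholders:

    import requests

    GTS_URL = "https://domain.ltd"      # your GoToSocial instance
    TOKEN = "YOUR_ACCESS_TOKEN"

    def resolve_remote_status(status_url: str) -> bool:
        """Ask the instance to fetch a remote post; True if it resolved."""
        r = requests.get(
            f"{GTS_URL}/api/v2/search",
            params={"q": status_url, "resolve": "true", "limit": 1},
            headers={"Authorization": f"Bearer {TOKEN}"},
            timeout=30,
        )
        return r.status_code == 200 and bool(r.json().get("statuses"))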