commit dc9f0ed7b89bf35feffea72e68544291f6595c8b Author: Dome Date: Fri Apr 17 09:17:10 2026 +0200 initial form upload diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..57540bd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +# Dockerfile +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Copy and install requirements +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Create data directory +RUN mkdir -p /app/data + +# Create non-root user +RUN useradd -r -u 1000 holmirdas + +# Set ownership +RUN chown -R holmirdas:holmirdas /app + +# Switch to non-root user +USER holmirdas + +# Default command (will be overridden by docker-compose) +CMD ["python", "gts_holmirdas.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..450a765 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Matthias + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..b23f68e --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +# GTS-HolMirDas šŸš€ + +RSS-based content discovery for [GoToSocial](https://codeberg.org/superseriousbusiness/gotosocial) instances. + +Automatically discovers and federates content from RSS feeds across the Fediverse, helping small GoToSocial instances populate their federated timeline without relying on traditional relays. + +Inspired by the original [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://mkultra.x27.one/@aliceif), adapted for GoToSocial with enhanced Docker deployment and multi-instance processing. + +## ✨ Key Features + +- **šŸ“” Multi-Instance Discovery** - Fetches content from configurable RSS feeds across Fediverse instances +- **⚔ Performance Scaling** - 20-100 posts per feed with URL parameters (`?limit=100`) +- **🐳 Production Ready** - Docker deployment, environment-based config, health monitoring +- **šŸ“Š Comprehensive Stats** - Runtime metrics, federation growth, performance tracking +- **šŸ”§ Zero Maintenance** - Runs automatically every hour with duplicate detection + +## šŸš€ Quick Start + +```bash +# Clone the repository +git clone https://git.klein.ruhr/matthias/gts-holmirdas +cd gts-holmirdas + +# Copy configuration templates +cp .env.example .env +cp rss_feeds.example.txt rss_feeds.txt + +# Edit configuration +nano .env # Add your GTS credentials +nano rss_feeds.txt # Customize RSS feeds + +# Deploy +docker compose up -d + +# Monitor +docker compose logs -f +``` + +## šŸ“ˆ Performance at Scale + +**Real Production Data:** +``` +šŸ“Š Runtime: 8:42 | 487 posts processed | 3,150+ instances discovered +⚔ 56 posts/minute | 102 RSS feeds | +45 new instances per run +šŸ’¾ Resource usage: ~450MB RAM total (GoToSocial + tools) +``` + +**Scaling Options:** +- **Conservative:** 20 posts/feed (~100 posts/run) +- **Balanced:** 50 posts/feed (~300 posts/run) +- **Aggressive:** 100 posts/feed (~600 
posts/run) + +## šŸ› ļø Configuration Essentials + +### Environment Variables (.env) +```bash +# Required +GTS_SERVER_URL=https://your-gts-instance.tld +GTS_ACCESS_TOKEN=your_gts_access_token + +# Performance Tuning +MAX_POSTS_PER_RUN=25 # Posts per feed per run +DELAY_BETWEEN_REQUESTS=1 # Seconds between API calls +LOG_LEVEL=INFO # DEBUG for troubleshooting +``` + +### RSS Feeds (rss_feeds.txt) +```bash +# Use URL parameters to scale performance +https://mastodon.social/tags/homelab.rss?limit=50 +https://fosstodon.org/tags/selfhosting.rss?limit=100 +https://infosec.exchange/tags/security.rss?limit=75 +``` + +### GoToSocial Access Token +1. Login to your GoToSocial instance +2. Settings → Applications → Create new application +3. Required scopes: `read`, `read:search`, `read:statuses` +4. Copy access token to `.env` file + +## šŸ“– Complete Documentation + +For detailed information, visit our **[Wiki](https://git.klein.ruhr/matthias/gts-holmirdas/wiki)**: + +- **[šŸ“‹ Installation Guide](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Installation-Guide.-)** - Detailed setup, Docker configuration, deployment options +- **[šŸ“ˆ Performance & Scaling](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Performance-%26-Scaling)** - Optimization tables, scaling strategies, resource planning +- **[šŸ› ļø Troubleshooting](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Troubleshooting)** - Common issues, Docker problems, debugging guide +- **[āš™ļø Advanced Configuration](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Advanced-Configuration)** - Environment variables, RSS strategies, production tips +- **[šŸ“Š Monitoring & Stats](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/Monitoring-%26-Stats)** - Understanding output, health monitoring, metrics +- **[ā“ FAQ](https://git.klein.ruhr/matthias/gts-holmirdas/wiki/FAQ+-+Frequently+Asked+Questions.-)** - Common questions and answers + +## šŸ¤ Community & Support + +- **[Contributing Guide](Contributing)** 
- Development setup and contribution guidelines *(coming soon)* +- **Issues**: [Report bugs or request features](https://git.klein.ruhr/matthias/gts-holmirdas/issues) +- **Contact**: [@matthias@me.klein.ruhr](https://me.klein.ruhr/@matthias) on the Fediverse + +## šŸ”— Related Projects + +- **[FediFetcher](https://github.com/nanos/fedifetcher)** - Fetches missing replies and posts +- **[GoToSocial](https://github.com/superseriousbusiness/gotosocial)** - Lightweight ActivityPub server +- **[slurp](https://github.com/VyrCossont/slurp)** - Import posts from other instances + +## šŸ“„ License + +MIT License - see [LICENSE](LICENSE) file for details. + +## šŸ™ Acknowledgments + +- Inspired by [HolMirDas](https://github.com/aliceif/HolMirDas) by [@aliceif](https://mkultra.x27.one/@aliceif) +- Built for the GoToSocial community +- RSS-to-ActivityPub federation approach \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000..c64bc46 --- /dev/null +++ b/compose.yml @@ -0,0 +1,32 @@ +services: + gts-holmirdas: + build: . + container_name: gts-holmirdas + restart: unless-stopped + + env_file: + - .env + + volumes: + - ./data:/app/data + - ./gts_holmirdas.py:/app/gts_holmirdas.py:ro + - ./rss_feeds.txt:/app/rss_feeds.txt:ro + + # Run every hour (balanced frequency) + entrypoint: > + sh -c " + while true; do + echo 'Starting GTS-HolMirDas run...' + python gts_holmirdas.py + echo 'GTS-HolMirDas run completed. Sleeping for 1 hour...' 
+ sleep 3600 + done + " + + # Resource limits + deploy: + resources: + limits: + memory: 512M + reservations: + memory: 256M diff --git a/gts_holmirdas.py b/gts_holmirdas.py new file mode 100644 index 0000000..77d1eaf --- /dev/null +++ b/gts_holmirdas.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +GTS-HolMirDas: RSS-based content discovery for GoToSocial + +Inspired by HolMirDas by @aliceif: +- GitHub: https://github.com/aliceif/HolMirDas +- Fediverse: @aliceif@mkultra.x27.one + +This GoToSocial adaptation extends the original RSS-to-ActivityPub concept +with Docker deployment, multi-instance processing, and comprehensive monitoring. +""" + +import os +import sys +import time +import json +import logging +import requests +import feedparser +from datetime import timedelta +from urllib.parse import quote_plus + +class GTSHolMirDas: + def __init__(self): + """Initialize the RSS fetcher with configuration""" + self.config = { + "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"), + "access_token": os.getenv("GTS_ACCESS_TOKEN", ""), + "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")), + "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")), + "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""), + "log_level": os.getenv("LOG_LEVEL", "INFO") + } + + # Setup logging FIRST + logging.basicConfig( + level=getattr(logging, self.config["log_level"]), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + # Load RSS URLs from file or environment + rss_urls_file = os.getenv("RSS_URLS_FILE") + if rss_urls_file and os.path.exists(rss_urls_file): + # Load from file + try: + with open(rss_urls_file, 'r') as f: + self.config["rss_urls"] = [ + line.split('#', 1)[0].strip() for line in f + if line.strip() and not line.strip().startswith('#') + ] + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}") + except Exception as e: + 
self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}") + self.config["rss_urls"] = [] + else: + # Fallback to environment variable + self.config["rss_urls"] = [ + url.strip() for url in os.getenv("RSS_URLS", "").split(",") + if url.strip() + ] + if self.config["rss_urls"]: + self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment") + + # Load processed URLs from persistent storage + self.processed_urls_file = "/app/data/processed_urls.json" + self.processed_urls = self.load_processed_urls() + + # Statistics tracking + self.previous_instances = getattr(self, 'previous_instances', 0) + + def load_processed_urls(self): + """Load previously processed URLs and instance count from file""" + try: + if os.path.exists(self.processed_urls_file): + with open(self.processed_urls_file, 'r') as f: + data = json.load(f) + # Load previous instance count for statistics + self.previous_instances = data.get('previous_instances', 0) + return set(data.get('processed_urls', [])) + except Exception as e: + self.logger.warning(f"Could not load processed URLs: {e}") + + return set() + + def save_processed_urls(self, current_instances=None): + """Save processed URLs and current instance count to file""" + try: + os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True) + data = { + 'processed_urls': list(self.processed_urls), + 'last_updated': time.time() + } + # Save current instance count for next run + if current_instances is not None and current_instances != 'unknown': + data['previous_instances'] = current_instances + + with open(self.processed_urls_file, 'w') as f: + json.dump(data, f, indent=2) + except Exception as e: + self.logger.error(f"Could not save processed URLs: {e}") + + def fetch_rss_urls(self, rss_url): + """Fetch URLs from RSS feed""" + try: + self.logger.info(f"Fetching RSS feed: {rss_url}") + + # Parse RSS feed + feed = feedparser.parse(rss_url) + + if feed.bozo: + self.logger.warning(f"RSS feed may have 
issues: {rss_url}") + + # Extract URLs from entries + urls = [] + for entry in feed.entries: + if hasattr(entry, 'link'): + urls.append(entry.link) + + self.logger.info(f"Found {len(urls)} URLs in RSS feed") + return urls + + except Exception as e: + self.logger.error(f"Error fetching RSS feed {rss_url}: {e}") + return [] + + def lookup_post(self, post_url): + """Look up a post URL using GTS search API""" + try: + # Prepare search API call + search_url = f"{self.config['server_url']}/api/v2/search" + params = { + 'q': post_url, + 'type': 'statuses', + 'resolve': 'true', + 'limit': 1 + } + headers = { + 'Authorization': f'Bearer {self.config["access_token"]}', + 'Content-Type': 'application/json' + } + + # Make API call + response = requests.get( + search_url, + params=params, + headers=headers, + timeout=30 + ) + + if response.status_code == 200: + results = response.json() + if results.get('statuses') or results.get('accounts'): + self.logger.info(f"Successfully looked up: {post_url}") + return True + else: + self.logger.warning(f"No results for: {post_url}") + return False + else: + self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}") + return False + + except requests.exceptions.RequestException as e: + self.logger.error(f"Error looking up {post_url}: {e}") + return False + + def process_feeds(self): + """Process all configured RSS feeds""" + total_processed = 0 + + # Record start time for statistics + self.start_time = time.time() + + # Ping healthcheck start + self.ping_healthcheck("/start") + + try: + for rss_url in self.config["rss_urls"]: + if not rss_url.strip(): + continue + + self.logger.info(f"Processing feed: {rss_url}") + + # Get URLs from RSS + urls = self.fetch_rss_urls(rss_url) + + # Filter out already processed URLs + new_urls = [url for url in urls if url not in self.processed_urls] + + if not new_urls: + self.logger.info("No new URLs to process") + continue + + # Rate limiting: max posts per run + 
urls_to_process = new_urls[:self.config["max_posts_per_run"]] + + self.logger.info(f"Processing {len(urls_to_process)} new URLs") + + for url in urls_to_process: + if self.lookup_post(url): + self.processed_urls.add(url) + total_processed += 1 + + # Rate limiting: delay between requests + time.sleep(self.config["delay_between_requests"]) + + # Calculate runtime + end_time = time.time() + runtime_seconds = end_time - self.start_time + runtime_formatted = str(timedelta(seconds=int(runtime_seconds))) + + # Get current instance count + try: + instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance", + headers={'Authorization': f'Bearer {self.config["access_token"]}'}, + timeout=10) + if instance_info.status_code == 200: + current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown') + else: + current_instances = 'unknown' + except Exception as e: + self.logger.error(f"Failed to get instance count: {e}") + current_instances = 'unknown' + + # Calculate new instances (if we have previous data) + new_instances = 'unknown' + if self.previous_instances > 0 and current_instances != 'unknown': + new_instances = current_instances - self.previous_instances + + # Print comprehensive statistics + print(f"\nšŸ“Š GTS-HolMirDas Run Statistics:") + print(f" ā±ļø Runtime: {runtime_formatted}") + print(f" šŸ“„ Total posts processed: {total_processed}") + print(f" 🌐 Current known instances: {current_instances}") + if new_instances != 'unknown' and new_instances > 0: + print(f" āž• New instances discovered: +{new_instances}") + elif new_instances == 0: + print(f" āž• New instances discovered: +0") + print(f" šŸ“” RSS feeds processed: {len(self.config['rss_urls'])}") + if runtime_seconds > 60: + print(f" ⚔ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}") + + self.save_processed_urls(current_instances) + + # Ping healthcheck success + self.ping_healthcheck("") + + except Exception as e: + self.logger.error(f"Error during 
processing: {e}") + # Ping healthcheck failure + self.ping_healthcheck("/fail") + raise + + def ping_healthcheck(self, endpoint=""): + """Ping healthchecks.io for monitoring""" + if not self.config.get("healthcheck_url"): + return + + try: + url = self.config["healthcheck_url"] + endpoint + requests.get(url, timeout=10) + except Exception as e: + self.logger.warning(f"Failed to ping healthcheck: {e}") + +def main(): + """Main entry point""" + try: + fetcher = GTSHolMirDas() + + # Validate required config + if not fetcher.config["access_token"]: + raise ValueError("GTS_ACCESS_TOKEN environment variable is required") + + fetcher.process_feeds() + + except Exception as e: + logging.error(f"Fatal error: {e}") + raise + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f2d32a5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.31.0 +feedparser==6.0.10 +urllib3==2.0.7 diff --git a/rss_feeds.example.txt b/rss_feeds.example.txt new file mode 100644 index 0000000..d532f51 --- /dev/null +++ b/rss_feeds.example.txt @@ -0,0 +1,17 @@ +# Example RSS feeds - customize for your interests + +# Add ?limit=X parameter to increase posts per feed (default: 20, max: 100) +# Higher limits = more content discovery, but longer processing time +# Performance tip: Start with limit=50, then increase to 100 if needed + +# homelab (up to 100 posts per feed) +https://mastodon.social/tags/homelab.rss # 20 posts/feed (default) +https://fosstodon.org/tags/homelab.rss?limit=50 # 50 posts/feed + +# selfhosting (up to 100 posts per feed) +https://mastodon.social/tags/selfhosting.rss?limit=100 # 100 posts/feed +https://infosec.exchange/tags/selfhosting.rss?limit=100 # 100 posts/feed + +# docker (up to 100 posts per feed) +https://social.tchncs.de/tags/docker.rss?limit=100 # 100 posts/feed +https://fosstodon.org/tags/docker.rss?limit=100 # 100 posts/feed