code optimizations and add examples
Executable
+18
@@ -0,0 +1,18 @@
+# Basics
+GTS_SERVER_URL=https://domain.ltd
+GTS_ACCESS_TOKEN=YOUR_ACCESS_TOKEN
+
+# Performance Tuning
+MAX_POSTS_PER_RUN=25       # Posts per feed per run
+DELAY_BETWEEN_REQUESTS=2   # Seconds between API calls
+FETCH_INTERVAL=30m         # Interval between updates
+REQUEST_TIMEOUT=30         # Timeout for requests to external servers
+LOG_LEVEL=INFO             # DEBUG for troubleshooting
+
+
+# Bot Identity
+#USER_AGENT=GTS-Federation-Bot/1.0 (Owner: @user@domain.ltd)  # Optional
+
+# File Paths (usually don't need to change)
+RSS_URLS_FILE=/app/rss_feeds.txt
+DATABASE_PATH=/app/data/processed_urls.json
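FETCH_INTERVAL accepts a number with an s/m/h suffix; the parse_interval() method added in the script diff below converts it to seconds. A minimal standalone sketch of that behavior (here with the script's bare except narrowed and the suffix lookup moved inside the try, so that empty or malformed values also fall back to the 30-minute default):

# Sketch of how FETCH_INTERVAL is interpreted, mirroring parse_interval()
# from the script below. Unknown unit suffixes fall back to minutes,
# unparsable values to 1800 seconds (30 minutes).
def parse_interval(interval_str: str) -> int:
    try:
        unit = interval_str[-1].lower()          # trailing s/m/h
        value = int(interval_str[:-1])           # numeric part
        return value * {'s': 1, 'm': 60, 'h': 3600}.get(unit, 60)
    except (ValueError, IndexError):
        return 1800                              # default: 30 minutes

print(parse_interval("30m"))  # 1800
print(parse_interval("2h"))   # 7200
print(parse_interval("45s"))  # 45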
Executable
+41
@@ -0,0 +1,41 @@
+# RSS Feeds
+
+# Tech & Homelab
+https://fosstodon.org/tags/homelab.rss?limit=25
+https://fosstodon.org/tags/docker.rss?limit=25
+https://fosstodon.org/tags/matrix.rss?limit=25
+https://fosstodon.org/tags/linux.rss?limit=25
+https://fosstodon.org/tags/foss.rss?limit=25
+https://fosstodon.org/tags/opensource.rss?limit=25
+https://mastodon.social/tags/opensource.rss?limit=25
+https://mastodon.social/tags/selfhosting.rss?limit=25
+https://mastodon.social/tags/technology.rss?limit=25
+https://social.tchncs.de/tags/linux.rss?limit=25
+https://social.tchncs.de/tags/synology.rss?limit=25
+
+# News & Politics
+https://mastodon.online/tags/nachrichten.rss?limit=25
+https://mastodon.social/tags/nachrichten.rss?limit=25
+https://norden.social/tags/nachrichten.rss?limit=25
+https://norden.social/tags/politik.rss?limit=25
+https://berlin.social/tags/politik.rss?limit=25
+https://social.bund.de/tags/digitalisierung.rss?limit=25
+https://ard.social/tags/tagesschau.rss?limit=25
+
+
+# Privacy & Security
+https://infosec.exchange/tags/security.rss?limit=25
+https://infosec.exchange/tags/privacy.rss?limit=25
+https://infosec.exchange/tags/cybersecurity.rss?limit=25
+https://infosec.exchange/tags/infosec.rss?limit=25
+https://infosec.exchange/tags/hacking.rss?limit=25
+https://norden.social/tags/datenschutz.rss?limit=25
+
+# Sports
+https://mastodon.social/tags/cycling.rss?limit=25
+
+# Special Topics & Community
+https://chaos.social/tags/ccc.rss?limit=25
+https://chaos.social/tags/republica.rss?limit=25
+https://mastodon.social/tags/fediverse.rss?limit=25
+https://hachyderm.io/tags/sysadmin.rss?limit=25
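Each entry is an instance's public per-hashtag RSS endpoint. When process_feeds() in the script below reads this file, blank lines and whole-line comments are skipped and inline comments are stripped. A minimal sketch of that filtering, under the assumption that '#' never appears inside a feed URL itself:

# Sketch of the feed-file filtering used in process_feeds() below:
# skip blank lines and full-line comments, strip inline comments.
def load_feed_urls(path: str) -> list[str]:
    with open(path) as f:
        return [
            line.split('#', 1)[0].strip()
            for line in f
            if line.strip() and not line.strip().startswith('#')
        ]

# Example: load_feed_urls("/app/rss_feeds.txt")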
Regular → Executable
+99
-243
@@ -1,15 +1,4 @@
 #!/usr/bin/env python3
-"""
-GTS-HolMirDas: RSS-based content discovery for GoToSocial
-
-Inspired by HolMirDas by @aliceif:
-- GitHub: https://github.com/aliceif/HolMirDas
-- Fediverse: @aliceif@mkultra.x27.one
-
-This GoToSocial adaptation extends the original RSS-to-ActivityPub concept
-with Docker deployment, multi-instance processing, and comprehensive monitoring.
-"""
-
 import os
 import sys
 import time
@@ -18,264 +7,131 @@ import logging
 import requests
 import feedparser
 from datetime import timedelta
-from urllib.parse import quote_plus
 
 class GTSHolMirDas:
     def __init__(self):
-        """Initialize the RSS fetcher with configuration"""
         self.config = {
-            "server_url": os.getenv("GTS_SERVER_URL", "https://your-gts-instance"),
+            "server_url": os.getenv("GTS_SERVER_URL", "").rstrip('/'),
             "access_token": os.getenv("GTS_ACCESS_TOKEN", ""),
             "max_posts_per_run": int(os.getenv("MAX_POSTS_PER_RUN", "25")),
             "delay_between_requests": int(os.getenv("DELAY_BETWEEN_REQUESTS", "2")),
-            "healthcheck_url": os.getenv("HEALTHCHECK_URL", ""),
-            "log_level": os.getenv("LOG_LEVEL", "INFO")
+            "fetch_interval": os.getenv("FETCH_INTERVAL", "30m"),
+            "log_level": os.getenv("LOG_LEVEL", "INFO"),
+            "rss_urls_file": os.getenv("RSS_URLS_FILE", "/app/rss_feeds.txt"),
+            "user_agent": os.getenv("USER_AGENT", "GTS-Federation-Bot/1.0 (+https://social.ztfr.eu)")
         }
 
-        # Setup logging FIRST
         logging.basicConfig(
             level=getattr(logging, self.config["log_level"]),
             format='%(asctime)s - %(levelname)s - %(message)s'
         )
         self.logger = logging.getLogger(__name__)
+        self.db_path = os.getenv("DATABASE_PATH", "/app/data/processed_urls.json")
+        self.processed_urls, self.previous_instances = self.load_state()
 
-        # Load RSS URLs from file or environment
-        rss_urls_file = os.getenv("RSS_URLS_FILE")
-        if rss_urls_file and os.path.exists(rss_urls_file):
-            # Load from file
-            try:
-                with open(rss_urls_file, 'r') as f:
-                    self.config["rss_urls"] = [
-                        line.split('#', 1)[0].strip() for line in f
-                        if line.strip() and not line.strip().startswith('#')
-                    ]
-                self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from file: {rss_urls_file}")
-            except Exception as e:
-                self.logger.error(f"Could not load RSS URLs from file {rss_urls_file}: {e}")
-                self.config["rss_urls"] = []
-        else:
-            # Fallback to environment variable
-            self.config["rss_urls"] = [
-                url.strip() for url in os.getenv("RSS_URLS", "").split(",")
-                if url.strip()
-            ]
-            if self.config["rss_urls"]:
-                self.logger.info(f"Loaded {len(self.config['rss_urls'])} RSS URLs from environment")
-
-        # Load processed URLs from persistent storage
-        self.processed_urls_file = "/app/data/processed_urls.json"
-        self.processed_urls = self.load_processed_urls()
-
-        # Statistics tracking
-        self.previous_instances = getattr(self, 'previous_instances', 0)
+        self.session = requests.Session()
+        self.session.headers.update({
+            "Authorization": f"Bearer {self.config['access_token']}",
+            "User-Agent": self.config['user_agent']
+        })
 
-    def load_processed_urls(self):
-        """Load previously processed URLs and instance count from file"""
-        try:
-            if os.path.exists(self.processed_urls_file):
-                with open(self.processed_urls_file, 'r') as f:
-                    data = json.load(f)
-                    # Load previous instance count for statistics
-                    self.previous_instances = data.get('previous_instances', 0)
-                    return set(data.get('processed_urls', []))
-        except Exception as e:
-            self.logger.warning(f"Could not load processed URLs: {e}")
-        return set()
+    def parse_interval(self, interval_str):
+        try:
+            unit = interval_str[-1].lower()
+            val = int(interval_str[:-1])
+            return val * {'s': 1, 'm': 60, 'h': 3600}.get(unit, 60)
+        except (ValueError, IndexError):
+            return 1800
 
-    def save_processed_urls(self, current_instances=None):
-        """Save processed URLs and current instance count to file"""
-        try:
-            os.makedirs(os.path.dirname(self.processed_urls_file), exist_ok=True)
-            data = {
-                'processed_urls': list(self.processed_urls),
-                'last_updated': time.time()
-            }
-            # Save current instance count for next run
-            if current_instances is not None and current_instances != 'unknown':
-                data['previous_instances'] = current_instances
-
-            with open(self.processed_urls_file, 'w') as f:
-                json.dump(data, f, indent=2)
-        except Exception as e:
-            self.logger.error(f"Could not save processed URLs: {e}")
+    def load_state(self):
+        if os.path.exists(self.db_path):
+            try:
+                with open(self.db_path, 'r') as f:
+                    data = json.load(f)
+                return set(data.get('processed_urls', [])), data.get('previous_instances', 0)
+            except Exception as e:
+                self.logger.warning(f"Could not load state DB: {e}")
+        return set(), 0
+
+    def save_state(self, current_instances):
+        try:
+            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
+            url_list = list(self.processed_urls)[-5000:]
+            with open(self.db_path, 'w') as f:
+                json.dump({'processed_urls': url_list, 'previous_instances': current_instances}, f, indent=2)
+        except Exception as e:
+            self.logger.error(f"Save error: {e}")
 
-    def fetch_rss_urls(self, rss_url):
-        """Fetch URLs from RSS feed"""
-        try:
-            self.logger.info(f"Fetching RSS feed: {rss_url}")
-
-            # Parse RSS feed
-            feed = feedparser.parse(rss_url)
-
-            if feed.bozo:
-                self.logger.warning(f"RSS feed may have issues: {rss_url}")
-
-            # Extract URLs from entries
-            urls = []
-            for entry in feed.entries:
-                if hasattr(entry, 'link'):
-                    urls.append(entry.link)
-
-            self.logger.info(f"Found {len(urls)} URLs in RSS feed")
-            return urls
-
-        except Exception as e:
-            self.logger.error(f"Error fetching RSS feed {rss_url}: {e}")
-            return []
-
-    def lookup_post(self, post_url):
-        """Look up a post URL using GTS search API"""
-        try:
-            # Prepare search API call
-            search_url = f"{self.config['server_url']}/api/v2/search"
-            params = {
-                'q': post_url,
-                'type': 'statuses',
-                'resolve': 'true',
-                'limit': 1
-            }
-            headers = {
-                'Authorization': f'Bearer {self.config["access_token"]}',
-                'Content-Type': 'application/json'
-            }
-
-            # Make API call
-            response = requests.get(
-                search_url,
-                params=params,
-                headers=headers,
-                timeout=30
-            )
-
-            if response.status_code == 200:
-                results = response.json()
-                if results.get('statuses') or results.get('accounts'):
-                    self.logger.info(f"Successfully looked up: {post_url}")
-                    return True
-                else:
-                    self.logger.warning(f"No results for: {post_url}")
-                    return False
-            else:
-                self.logger.error(f"API error {response.status_code} for {post_url}: {response.text}")
-                return False
-
-        except requests.exceptions.RequestException as e:
-            self.logger.error(f"Error looking up {post_url}: {e}")
-            return False
-
     def process_feeds(self):
-        """Process all configured RSS feeds"""
-        total_processed = 0
-
-        # Record start time for statistics
-        self.start_time = time.time()
-
-        # Ping healthcheck start
-        self.ping_healthcheck("/start")
-
-        try:
-            for rss_url in self.config["rss_urls"]:
-                if not rss_url.strip():
-                    continue
-
-                self.logger.info(f"Processing feed: {rss_url}")
-
-                # Get URLs from RSS
-                urls = self.fetch_rss_urls(rss_url)
-
-                # Filter out already processed URLs
-                new_urls = [url for url in urls if url not in self.processed_urls]
-
-                if not new_urls:
-                    self.logger.info("No new URLs to process")
-                    continue
-
-                # Rate limiting: max posts per run
-                urls_to_process = new_urls[:self.config["max_posts_per_run"]]
-
-                self.logger.info(f"Processing {len(urls_to_process)} new URLs")
-
-                for url in urls_to_process:
-                    if self.lookup_post(url):
-                        self.processed_urls.add(url)
-                        total_processed += 1
-
-                    # Rate limiting: delay between requests
-                    time.sleep(self.config["delay_between_requests"])
-
-            # Calculate runtime
-            end_time = time.time()
-            runtime_seconds = end_time - self.start_time
-            runtime_formatted = str(timedelta(seconds=int(runtime_seconds)))
-
-            # Get current instance count
-            try:
-                instance_info = requests.get(f"{self.config['server_url']}/api/v1/instance",
-                                             headers={'Authorization': f'Bearer {self.config["access_token"]}'},
-                                             timeout=10)
-                if instance_info.status_code == 200:
-                    current_instances = instance_info.json().get('stats', {}).get('domain_count', 'unknown')
-                else:
-                    current_instances = 'unknown'
-            except Exception as e:
-                self.logger.error(f"Failed to get instance count: {e}")
-                current_instances = 'unknown'
-
-            # Calculate new instances (if we have previous data)
-            new_instances = 'unknown'
-            if self.previous_instances > 0 and current_instances != 'unknown':
-                new_instances = current_instances - self.previous_instances
-
-            # Print comprehensive statistics
-            print(f"\n📊 GTS-HolMirDas Run Statistics:")
-            print(f"   ⏱️ Runtime: {runtime_formatted}")
-            print(f"   📄 Total posts processed: {total_processed}")
-            print(f"   🌐 Current known instances: {current_instances}")
-            if new_instances != 'unknown' and new_instances > 0:
-                print(f"   ➕ New instances discovered: +{new_instances}")
-            elif new_instances == 0:
-                print(f"   ➕ New instances discovered: +0")
-            print(f"   📡 RSS feeds processed: {len(self.config['rss_urls'])}")
-            if runtime_seconds > 60:
-                print(f"   ⚡ Posts per minute: {total_processed / (runtime_seconds / 60):.1f}")
-
-            self.save_processed_urls(current_instances)
-
-            # Ping healthcheck success
-            self.ping_healthcheck("")
-
-        except Exception as e:
-            self.logger.error(f"Error during processing: {e}")
-            # Ping healthcheck failure
-            self.ping_healthcheck("/fail")
-            raise
-
-    def ping_healthcheck(self, endpoint=""):
-        """Ping healthchecks.io for monitoring"""
-        if not self.config.get("healthcheck_url"):
+        if not os.path.exists(self.config["rss_urls_file"]):
+            self.logger.error("RSS_URLS_FILE missing!")
             return
 
+        with open(self.config["rss_urls_file"], 'r') as f:
+            rss_urls = [l.split('#')[0].strip() for l in f if l.strip() and not l.strip().startswith('#')]
+
+        total_new = 0
+        start_time = time.time()
+
+        for i, rss_url in enumerate(rss_urls, 1):
+            self.logger.info(f"[{i}/{len(rss_urls)}] 📡 {rss_url}")
+            try:
+                resp = requests.get(rss_url, timeout=15, headers={"User-Agent": self.config['user_agent']})
+                feed = feedparser.parse(resp.content)
+
+                if not feed.entries:
+                    continue
+
+                new_links = [e.link for e in feed.entries if hasattr(e, 'link') and e.link not in self.processed_urls]
+
+                if new_links:
+                    for url in new_links[:self.config["max_posts_per_run"]]:
+                        try:
+                            # Timeout raised to 30s to avoid "Read timed out" errors
+                            r = self.session.get(
+                                f"{self.config['server_url']}/api/v2/search",
+                                params={'q': url, 'resolve': 'true', 'limit': 1},
+                                timeout=30
+                            )
+                            if r.status_code == 200:
+                                self.processed_urls.add(url)
+                                total_new += 1
+                            elif r.status_code == 429:
+                                self.logger.warning("Rate limit hit! Waiting 10s...")
+                                time.sleep(10)
+
+                            time.sleep(self.config["delay_between_requests"])
+                        except Exception as e:
+                            self.logger.error(f"Error on post {url}: {e}")
+
+                    # OPTIMIZATION: save state after every feed that found new posts
+                    self.save_state(self.previous_instances)
+
+            except Exception as e:
+                self.logger.error(f"Error on feed {rss_url}: {e}")
+
+        # Instance statistics at the end of the whole run
         try:
-            url = self.config["healthcheck_url"] + endpoint
-            requests.get(url, timeout=10)
-        except Exception as e:
-            self.logger.warning(f"Failed to ping healthcheck: {e}")
+            ri = self.session.get(f"{self.config['server_url']}/api/v1/instance", timeout=10)
+            curr = ri.json().get('stats', {}).get('domain_count', 0)
+            diff = max(0, curr - self.previous_instances) if self.previous_instances else 0
+        except Exception:
+            curr, diff = self.previous_instances, 0
 
-def main():
-    """Main entry point"""
-    try:
-        fetcher = GTSHolMirDas()
-
-        # Validate required config
-        if not fetcher.config["access_token"]:
-            raise ValueError("GTS_ACCESS_TOKEN environment variable is required")
-
-        fetcher.process_feeds()
-
-    except Exception as e:
-        logging.error(f"Fatal error: {e}")
-        raise
+        runtime = str(timedelta(seconds=int(time.time() - start_time)))
+        print(f"\n✅ Run finished | Time: {runtime} | New posts: {total_new} | Instances: {curr} (+{diff})")
+        self.save_state(curr)
+
+    def run_forever(self):
+        wait = self.parse_interval(self.config["fetch_interval"])
+        self.logger.info(f"GTS-Federator active (interval: {self.config['fetch_interval']})")
+        while True:
+            self.process_feeds()
+            self.logger.info(f"Next run in {self.config['fetch_interval']}...")
+            time.sleep(wait)
 
 if __name__ == "__main__":
-    main()
+    bot = GTSHolMirDas()
+    if not bot.config["access_token"]:
+        sys.exit("Error: GTS_ACCESS_TOKEN missing!")
+    bot.run_forever()
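The optimized loop drives federation through GoToSocial's search endpoint: querying a remote status URL with resolve=true makes the instance fetch that post (and its author) over ActivityPub, which is how new domains become known. A minimal standalone sketch of that lookup, assuming GTS_SERVER_URL and GTS_ACCESS_TOKEN are set as in the .env file above; the status URL is a placeholder, not a real post:

# Sketch of the /api/v2/search lookup the loop above relies on.
# The example status URL below is a placeholder.
import os
import requests

server = os.environ["GTS_SERVER_URL"].rstrip('/')
token = os.environ["GTS_ACCESS_TOKEN"]

resp = requests.get(
    f"{server}/api/v2/search",
    params={'q': 'https://fosstodon.org/@someuser/1234567890', 'resolve': 'true', 'limit': 1},
    headers={'Authorization': f'Bearer {token}'},
    timeout=30,
)
resp.raise_for_status()
# A successfully resolved post appears under 'statuses';
# an unknown or unreachable URL yields empty result lists.
print(resp.json().get('statuses', []))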