Browse Source

Eerste versie van Correspondent RSS creator

Stan Janssen 5 years ago
commit
8e77bb77d7
3 changed files with 188 additions and 0 deletions
  1. 24 0
      README.md
  2. 160 0
      correspondent_rss.py
  3. 4 0
      requirements.txt

+ 24 - 0
README.md

@@ -0,0 +1,24 @@
+# De Correspondent RSS Feed maker
+
+Met dit script kun je een RSS-feed maken van de artikelen op De Correspondent.
+
+Je hebt hiervoor Python 3.7 nodig.
+
+## Installatie
+
+```
+git clone https://gitlab.com/finetuned/correspondent-rss-feed
+cd correspondent-rss-feed
+python3.7 -m venv python_env
+./python_env/bin/pip3 install -r requirements.txt
+```
+
+## Gebruik
+
+Je roept het script aan met de parameters email, wachtwoord en de bestandsnaam van het XML-bestand dat je wilt genereren.
+
+```
+./python_env/bin/python3 correspondent_rss.py mijn@email.adres MijnWachtwoord rss.xml
+```
+
+Eventueel is dit als crontab in te stellen voor automatisch gebruik. Plaats het bestand ergens waar je er vanaf het internet bij kunt; dan kun je de RSS-feed gebruiken in je RSS-lezer.

+ 160 - 0
correspondent_rss.py

@@ -0,0 +1,160 @@
+#!/usr/bin/env python3.7
+
+import requests
+from bs4 import BeautifulSoup
+from lxml import etree
+from sys import argv
+from datetime import datetime
+import html
+import time
+
+
# Require exactly three positional arguments: email, password, output file.
if len(argv) != 4:
    # BUG FIX: the original string was missing the f-string prefix, so it
    # printed the literal text "{argv[0]}" instead of the script name.
    print(f"Usage: {argv[0]} USERNAME PASSWORD OUTFILE")
    raise SystemExit(1)  # same effect as quit(1), without relying on site.py

emailaddr = argv[1]  # De Correspondent account email address
password = argv[2]   # account password (note: plain text on the command line)
outfile = argv[3]    # path of the RSS XML file to generate
+
class Article:
    """One publication scraped from a De Correspondent overview page.

    Wraps the ``publication-card__body`` element of the /recent page and the
    fully downloaded article page it links to.  The class attributes
    ``cookies`` and ``headers`` are injected at module level after login,
    before any instance is created.
    """

    def __init__(self, card):
        self.card = card
        # Download the full article page up front; the date and summary
        # properties both read from it.
        self.full = self.get_full_article()

    @property
    def title(self):
        """Article title with any remaining HTML entities decoded."""
        anchor = self.card.find('a', class_='publication-card__title')
        return html.unescape(anchor.text.strip())

    @property
    def author(self):
        """All author names joined into one comma-separated string."""
        name_divs = self.card.find('div', class_='publication-card__names').find_all('div')
        names = [div.text.strip() for div in name_divs]
        return ", ".join(names)

    @property
    def link(self):
        """URL of the article, taken from the card's title anchor."""
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self):
        """ISO-8601 publication timestamp from the article's <time> tag."""
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self):
        """Publication date reformatted for an RSS <pubDate> element."""
        parsed = datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z")
        return parsed.strftime("%a, %d %b %Y %H:%M:%S %z")

    @property
    def summary(self):
        """Lead paragraph of the article, or None when the page has none."""
        lead = self.full.find('div', class_='article-lead')
        return lead.text.strip() if lead else None

    def get_full_article(self):
        """Download and parse the article page, retrying while rate-limited.

        Any non-200 response is treated as rate limiting: wait ten seconds
        and try again until the page comes through.
        """
        while True:
            response = requests.get(self.link, cookies=self.cookies, headers=self.headers)
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
            print("Waiting for rate-limiter...")
            time.sleep(10)
+
+
# Log in through the JSON API.  ``stream=True`` keeps the raw urllib3
# response available so the unparsed Set-Cookie header can be read below.
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15",
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
    stream = True
)
# NOTE(review): this hand-parses the *concatenated* Set-Cookie header.
# urllib3 joins multiple Set-Cookie headers with ", ", which is why the
# session cookie ends up under the odd key "secure, session" — the previous
# cookie's trailing "secure" attribute runs into the next cookie's name.
# Fragile: it breaks if the server changes cookie order or attributes.
# ``response.cookies`` would be the robust alternative — verify against a
# live response before changing.
cookies = response.raw.headers["Set-Cookie"]
cookies_list = cookies.split("; ")
cookies_dict = {item.split("=")[0]:item.split("=")[1] for item in cookies_list if "=" in item}
cookies_reuse = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}

# Inject the authenticated cookies and a browser User-Agent as class
# attributes so every Article instance can fetch its full article page
# (read by Article.get_full_article via self.cookies / self.headers).
Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"}
Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
+
+
+
# Fetch the overview page of recent articles with the authenticated session.
req = requests.get("https://decorrespondent.nl/recent",
                    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
                    cookies=cookies_reuse)
soup = BeautifulSoup(req.text, 'html.parser')
# Generator: each Article downloads its full page lazily, one at a time,
# which keeps the requests spread out for the rate-limiter.
articles = (Article(card) for card in soup.find_all('div', class_='publication-card__body'))

# Intermediate feed structure (JSON Feed-shaped; only "items" is consumed
# by the XML builder below).
feed = {"version": "https://jsonfeed.org/version/1",
        "title": "De Correspondent",
        "home_page_url": "https://decorrespondent.nl",
        "feed_url": "https://finetuned.nl/decorrespondent.json",
        "items": []}

for article in articles:
    # BUG FIX: the original wrote {"id": id}, storing the *builtin function*
    # ``id`` in every item.  Use the article URL as the unique item id, the
    # same value the XML builder emits as <guid>.
    feed["items"].append({"id": article.link,
                          "title": article.title,
                          "link": article.link,
                          "content_text": article.summary,
                          "summary": article.summary,
                          "date_published": article.date,
                          "author": article.author,
                          "rfc822_date": article.rfc_822_date})
+
# Construct the RSS 2.0 document from the collected feed items and write
# it to the requested output file.

root = etree.Element("rss", version="2.0")
channel = etree.SubElement(root, "channel")
etree.SubElement(channel, 'title').text = "De Correspondent"
etree.SubElement(channel, 'link').text = "https://decorrespondent.nl"
etree.SubElement(channel, 'description').text = "Een dagelijks medicijn tegen de waan van de dag"

for entry in feed['items']:
    # One <item> per article; <guid> reuses the link as the unique id.
    item = etree.SubElement(channel, 'item')
    etree.SubElement(item, 'title').text = entry['title']
    etree.SubElement(item, 'description').text = entry['summary']
    etree.SubElement(item, 'link').text = entry['link']
    etree.SubElement(item, 'guid').text = entry['link']
    etree.SubElement(item, 'pubDate').text = entry['rfc822_date']
    etree.SubElement(item, 'author').text = entry['author']

tree = etree.ElementTree(root)

with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')

+ 4 - 0
requirements.txt

@@ -0,0 +1,4 @@
+bs4
+requests
+lxml
+