
Not reloading already existing items in target XML

Stan Jansen, 5 years ago
parent commit 65858671f5
1 changed file with 43 additions and 33 deletions:
  correspondent_rss.py (+43 −33)
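In outline, the change works in two steps: the full article page is no longer fetched in Article.__init__ but only on first use of a property that needs it, and cards whose <link> already appears in the target XML are skipped before that fetch ever happens, so known articles cost no requests at all. The hand-rolled cache (self.full = None, filled by get_full_article) could equally be written with functools.cached_property (Python 3.8+); a minimal self-contained sketch of that alternative, not what this commit does:

    from functools import cached_property

    import requests
    from bs4 import BeautifulSoup


    class Article:
        def __init__(self, link):
            self.link = link

        @cached_property
        def full(self):
            # Runs once, on first access; articles skipped because
            # their link is already in the feed never trigger a request.
            r = requests.get(self.link)
            r.raise_for_status()
            return BeautifulSoup(r.text, 'html.parser')

        @property
        def date(self):
            return self.full.find('time')['datetime']

A usage example for the new positional arguments follows the diff.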

correspondent_rss.py  +43 −33

@@ -7,11 +7,12 @@ from sys import argv
 from datetime import datetime
 import time
 import argparse
+import os.path
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-u", "--username", required=True, type=str, help="Je gebruikersname (e-mailadres) voor de correspondent")
-parser.add_argument("-p", "--password", required=True,type=str, help="Je wachtwoord voor de correspondent")
-parser.add_argument("-o", "--outfile", required=True, type=str, help="Volledig pad met filename waar de XML moet worden gezet")
+parser.add_argument("username", type=str, help="Je gebruikersname (e-mailadres) voor de correspondent")
+parser.add_argument("password", type=str, help="Je wachtwoord voor de correspondent")
+parser.add_argument("outfile", type=str, help="Volledig pad met filename waar de XML moet worden gezet")
 
 args = parser.parse_args()
 
@@ -22,7 +23,7 @@ outfile = args.outfile
 class Article:
     def __init__(self, card):
         self.card = card
-        self.full = self.get_full_article()
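+        # The full article HTML is now fetched lazily, on first use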
+        self.full = None
 
     @property
     def title(self):
@@ -32,35 +33,51 @@ class Article:
     def author(self):
         authors = self.card.find('div', class_='publication-card__names').find_all('div')
         return ", ".join([author.text.strip() for author in authors])
-    
+
     @property
     def link(self):
         return self.card.find('a', class_='publication-card__title')['href']
 
     @property
     def date(self):
+        if not self.full:
+            self.get_full_article()
         return self.full.find('time')['datetime']
 
     @property
     def rfc_822_date(self):
         return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")
-    
+
     @property
     def summary(self):
+        if not self.full:
+            self.get_full_article()
         summary = self.full.find('div', class_='article-lead')
         if not summary:
             return None
         return summary.text.strip()
-    
+
     def get_full_article(self):
-        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)        
+        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
         while r.status_code != 200:
             print("Waiting for rate-limiter...")
             time.sleep(10)
             r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
-        return BeautifulSoup(r.text, 'html.parser')
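+        # Cache the parsed page on the instance instead of returning it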
+        self.full = BeautifulSoup(r.text, 'html.parser')
+
 
+# Load the existing file and gather the known URLs
+existing_xml = None
+known_links = []
+if os.path.isfile(outfile):
+    try:
+        existing_xml = etree.parse(outfile)
+        known_links = [link.text for link in existing_xml.iterfind('.//link')]
+    except Exception:  # unreadable or corrupt XML: treat as no known links
+        pass
 
+
+# Log in to the website
 response = requests.post(
     url="https://decorrespondent.nl/api2/account/password-authenticate",
     headers={
@@ -85,31 +102,15 @@ Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4
 Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
 
 
-
+# Get the page that contains the latest articles
 req = requests.get("https://decorrespondent.nl/recent",
                     headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
                     cookies=cookies_reuse)
 soup = BeautifulSoup(req.text, 'html.parser')
 articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))
 
-feed = {"version": "https://jsonfeed.org/version/1",
-        "title": "De Correspondent",
-        "home_page_url": "https://decorrespondent.nl",
-        "feed_url": "https://finetuned.nl/decorrespondent.json",
-        "items": []}
-
-for article in articles:
-    feed["items"].append({"id": id,
-                          "title": article.title,
-                          "link": article.link,
-                          "content_text": article.summary,
-                          "summary": article.summary,
-                          "date_published": article.date,
-                          "author": article.author,
-                          "rfc822_date": article.rfc_822_date})
 
 # Construct XML feed
-
 root = etree.Element("rss", version="2.0")
 channel = etree.Element("channel")
 title = etree.Element('title')
@@ -124,33 +125,42 @@ description = etree.Element('description')
 description.text = "Een dagelijks medicijn tegen de waan van de dag"
 channel.append(description)
 
-for article in feed['items']:
+for article in articles:
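+    # Skip articles whose link is already in the existing feed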
+    if article.link in known_links:
+        continue
+
     item = etree.Element('item')
     title = etree.Element('title')
-    title.text = article['title']
+    title.text = article.title
     item.append(title)
 
     description = etree.Element('description')
-    description.text = article['summary']
+    description.text = article.summary
     item.append(description)
 
     link = etree.Element('link')
-    link.text = article['link']
+    link.text = article.link
     item.append(link)
 
     guid = etree.Element('guid')
-    guid.text = article['link']
+    guid.text = article.link
     item.append(guid)
 
     pubDate = etree.Element('pubDate')
-    pubDate.text = article['rfc822_date']
+    pubDate.text = article.rfc_822_date
     item.append(pubDate)
 
     author = etree.Element('author')
-    author.text = article['author']
+    author.text = article.author
     item.append(author)
 
     channel.append(item)
+
+# Add previously loaded articles (if any)
+if existing_xml:
+    for item in existing_xml.iterfind('.//item'):
+        channel.append(item)
+
 root.append(channel)
 
 tree = etree.ElementTree(root)
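
Since the argparse options -u/-p/-o were replaced by positional arguments, the script is now invoked as below (credentials and path are placeholders):

    python correspondent_rss.py user@example.com s3cret /var/www/feeds/decorrespondent.xml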