
Not reloading already existing items in target XML

Stan Jansen, 5 years ago
parent commit 65858671f5
1 changed file with 43 additions and 33 deletions:
  correspondent_rss.py (+43 −33)
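In outline, the change works in two steps: the full article page is no longer fetched in Article.__init__ but only on first use of a property that needs it, and cards whose <link> already appears in the target XML are skipped before that fetch ever happens, so known articles cost no requests at all. The hand-rolled cache (self.full = None, filled by get_full_article) could equally be written with functools.cached_property (Python 3.8+); a minimal self-contained sketch of that alternative, not what this commit does:

    from functools import cached_property

    import requests
    from bs4 import BeautifulSoup


    class Article:
        def __init__(self, link):
            self.link = link

        @cached_property
        def full(self):
            # Runs once, on first access; articles skipped because
            # their link is already in the feed never trigger a request.
            r = requests.get(self.link)
            r.raise_for_status()
            return BeautifulSoup(r.text, 'html.parser')

        @property
        def date(self):
            return self.full.find('time')['datetime']

A usage example for the new positional arguments follows the diff.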

correspondent_rss.py  +43 −33

@@ -7,11 +7,12 @@ from sys import argv
 from datetime import datetime
 import time
 import argparse
+import os.path
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-u", "--username", required=True, type=str, help="Je gebruikersname (e-mailadres) voor de correspondent")
-parser.add_argument("-p", "--password", required=True,type=str, help="Je wachtwoord voor de correspondent")
-parser.add_argument("-o", "--outfile", required=True, type=str, help="Volledig pad met filename waar de XML moet worden gezet")
+parser.add_argument("username", type=str, help="Je gebruikersname (e-mailadres) voor de correspondent")
+parser.add_argument("password", type=str, help="Je wachtwoord voor de correspondent")
+parser.add_argument("outfile", type=str, help="Volledig pad met filename waar de XML moet worden gezet")
 
 args = parser.parse_args()
 
@@ -22,7 +23,7 @@ outfile = args.outfile
 class Article:
     def __init__(self, card):
         self.card = card
-        self.full = self.get_full_article()
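+        # The full article HTML is now fetched lazily, on first use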
+        self.full = None
 
     @property
     def title(self):
@@ -32,35 +33,51 @@ class Article:
     def author(self):
         authors = self.card.find('div', class_='publication-card__names').find_all('div')
         return ", ".join([author.text.strip() for author in authors])
-    
+
     @property
     def link(self):
         return self.card.find('a', class_='publication-card__title')['href']
 
     @property
     def date(self):
+        if not self.full:
+            self.get_full_article()
         return self.full.find('time')['datetime']
 
     @property
     def rfc_822_date(self):
         return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")
-    
+
     @property
     def summary(self):
+        if not self.full:
+            self.get_full_article()
         summary = self.full.find('div', class_='article-lead')
         if not summary:
             return None
         return summary.text.strip()
-    
+
     def get_full_article(self):
-        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)        
+        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
         while r.status_code != 200:
             print("Waiting for rate-limiter...")
             time.sleep(10)
             r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
-        return BeautifulSoup(r.text, 'html.parser')
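+        # Cache the parsed page on the instance instead of returning it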
+        self.full = BeautifulSoup(r.text, 'html.parser')
+
 
+# Load the existing file and gather the known URLs
+existing_xml = None
+known_links = []
+if os.path.isfile(outfile):
+    try:
+        existing_xml = etree.parse(outfile)
+        known_links = [link.text for link in existing_xml.iterfind('.//link')]
+    except Exception:  # unreadable or corrupt XML: treat as no known links
+        pass
 
+
+# Log in to the website
 response = requests.post(
     url="https://decorrespondent.nl/api2/account/password-authenticate",
     headers={
@@ -85,31 +102,15 @@ Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4
 Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
 
 
-
+# Get the page that contains the latest articles
 req = requests.get("https://decorrespondent.nl/recent",
                     headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
                     cookies=cookies_reuse)
 soup = BeautifulSoup(req.text, 'html.parser')
 articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))
 
-feed = {"version": "https://jsonfeed.org/version/1",
-        "title": "De Correspondent",
-        "home_page_url": "https://decorrespondent.nl",
-        "feed_url": "https://finetuned.nl/decorrespondent.json",
-        "items": []}
-
-for article in articles:
-    feed["items"].append({"id": id,
-                          "title": article.title,
-                          "link": article.link,
-                          "content_text": article.summary,
-                          "summary": article.summary,
-                          "date_published": article.date,
-                          "author": article.author,
-                          "rfc822_date": article.rfc_822_date})
 
 # Construct XML feed
-
 root = etree.Element("rss", version="2.0")
 channel = etree.Element("channel")
 title = etree.Element('title')
@@ -124,33 +125,42 @@ description = etree.Element('description')
 description.text = "Een dagelijks medicijn tegen de waan van de dag"
 channel.append(description)
 
-for article in feed['items']:
+for article in articles:
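+    # Skip articles whose link is already in the existing feed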
+    if article.link in known_links:
+        continue
+
     item = etree.Element('item')
     title = etree.Element('title')
-    title.text = article['title']
+    title.text = article.title
     item.append(title)
 
     description = etree.Element('description')
-    description.text = article['summary']
+    description.text = article.summary
     item.append(description)
 
     link = etree.Element('link')
-    link.text = article['link']
+    link.text = article.link
     item.append(link)
 
     guid = etree.Element('guid')
-    guid.text = article['link']
+    guid.text = article.link
     item.append(guid)
 
     pubDate = etree.Element('pubDate')
-    pubDate.text = article['rfc822_date']
+    pubDate.text = article.rfc_822_date
     item.append(pubDate)
 
     author = etree.Element('author')
-    author.text = article['author']
+    author.text = article.author
     item.append(author)
 
     channel.append(item)
+
+# Add previously loaded articles (if any)
+if existing_xml:
+    for item in existing_xml.iterfind('.//item'):
+        channel.append(item)
+
 root.append(channel)
 
 tree = etree.ElementTree(root)
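
Since the argparse options -u/-p/-o were replaced by positional arguments, the script is now invoked as below (credentials and path are placeholders):

    python correspondent_rss.py user@example.com s3cret /var/www/feeds/decorrespondent.xml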