@@ -7,11 +7,12 @@ from sys import argv
 from datetime import datetime
 import time
 import argparse
+import os.path
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-u", "--username", required=True, type=str, help="Your username (email address) for De Correspondent")
-parser.add_argument("-p", "--password", required=True, type=str, help="Your password for De Correspondent")
-parser.add_argument("-o", "--outfile", required=True, type=str, help="Full path, including filename, where the XML should be written")
+parser.add_argument("username", type=str, help="Your username (email address) for De Correspondent")
+parser.add_argument("password", type=str, help="Your password for De Correspondent")
+parser.add_argument("outfile", type=str, help="Full path, including filename, where the XML should be written")
 
 args = parser.parse_args()
 
@@ -22,7 +23,7 @@ outfile = args.outfile
 class Article:
     def __init__(self, card):
         self.card = card
-        self.full = self.get_full_article()
+        self.full = None
 
     @property
     def title(self):
@@ -32,35 +33,51 @@ class Article:
     def author(self):
         authors = self.card.find('div', class_='publication-card__names').find_all('div')
         return ", ".join([author.text.strip() for author in authors])
-
+
     @property
     def link(self):
         return self.card.find('a', class_='publication-card__title')['href']
 
     @property
     def date(self):
+        if not self.full:
+            self.get_full_article()
         return self.full.find('time')['datetime']
 
     @property
     def rfc_822_date(self):
         return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")
-
+
     @property
     def summary(self):
+        if not self.full:
+            self.get_full_article()
         summary = self.full.find('div', class_='article-lead')
         if not summary:
             return None
         return summary.text.strip()
-
+
     def get_full_article(self):
-        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
+        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
         while r.status_code != 200:
             print("Waiting for rate-limiter...")
             time.sleep(10)
             r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
-        return BeautifulSoup(r.text, 'html.parser')
+        self.full = BeautifulSoup(r.text, 'html.parser')
+
 
+# Load the existing file and gather the known URLs
+existing_xml = None
+known_links = []
+if os.path.isfile(outfile):
+    try:
+        existing_xml = etree.parse(outfile)
+        known_links = [link.text for link in existing_xml.iterfind('.//link')]
+    except Exception:
+        pass
 
+
+# Log in to the website
 response = requests.post(
     url="https://decorrespondent.nl/api2/account/password-authenticate",
     headers={
@@ -85,31 +102,15 @@ Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4
 Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
 
 
-
+# Get the page that contains the latest articles
 req = requests.get("https://decorrespondent.nl/recent",
                    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
                    cookies=cookies_reuse)
 soup = BeautifulSoup(req.text, 'html.parser')
 articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))
 
-feed = {"version": "https://jsonfeed.org/version/1",
-        "title": "De Correspondent",
-        "home_page_url": "https://decorrespondent.nl",
-        "feed_url": "https://finetuned.nl/decorrespondent.json",
-        "items": []}
-
-for article in articles:
-    feed["items"].append({"id": id,
-                          "title": article.title,
-                          "link": article.link,
-                          "content_text": article.summary,
-                          "summary": article.summary,
-                          "date_published": article.date,
-                          "author": article.author,
-                          "rfc822_date": article.rfc_822_date})
 
 # Construct XML feed
-
 root = etree.Element("rss", version="2.0")
 channel = etree.Element("channel")
 title = etree.Element('title')
@@ -124,33 +125,42 @@ description = etree.Element('description')
 description.text = "Een dagelijks medicijn tegen de waan van de dag"
 channel.append(description)
 
-for article in feed['items']:
+for article in articles:
+    if article.link in known_links:
+        continue
+
     item = etree.Element('item')
     title = etree.Element('title')
-    title.text = article['title']
+    title.text = article.title
     item.append(title)
 
     description = etree.Element('description')
-    description.text = article['summary']
+    description.text = article.summary
     item.append(description)
 
     link = etree.Element('link')
-    link.text = article['link']
+    link.text = article.link
     item.append(link)
 
     guid = etree.Element('guid')
-    guid.text = article['link']
+    guid.text = article.link
     item.append(guid)
 
     pubDate = etree.Element('pubDate')
-    pubDate.text = article['rfc822_date']
+    pubDate.text = article.rfc_822_date
     item.append(pubDate)
 
     author = etree.Element('author')
-    author.text = article['author']
+    author.text = article.author
     item.append(author)
 
     channel.append(item)
+
+# Add previously loaded articles (if any)
+if existing_xml:
+    for item in existing_xml.iterfind('.//item'):
+        channel.append(item)
+
 root.append(channel)
 
 tree = etree.ElementTree(root)
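
The core of this change is an incremental-update pattern: parse the XML written on a previous run, remember which <link> values it already contains, skip articles that are already in the feed, and carry the old <item> elements over into the new document. Below is a minimal self-contained sketch of that pattern using the standard-library xml.etree.ElementTree; the feed path and the scraped list are placeholders, not taken from the script above, which gets them from the command line and the scraper.

import os.path
import xml.etree.ElementTree as etree

FEED_PATH = "feed.xml"  # hypothetical; the script above takes this as the outfile argument
scraped = [("Example article", "https://example.com/a1")]  # stand-in for the scraped articles

# Load the previous feed, if any, and collect the links it already contains
existing, known = None, set()
if os.path.isfile(FEED_PATH):
    try:
        existing = etree.parse(FEED_PATH)
        known = {link.text for link in existing.iterfind(".//link")}
    except etree.ParseError:
        pass  # a corrupt file is treated like a missing one

root = etree.Element("rss", version="2.0")
channel = etree.SubElement(root, "channel")

# Only add items whose link was not in the previous feed
for title_text, url in scraped:
    if url in known:
        continue
    item = etree.SubElement(channel, "item")
    etree.SubElement(item, "title").text = title_text
    etree.SubElement(item, "link").text = url

# Re-append every item from the previous run so the feed keeps its history
if existing is not None:
    for item in existing.iterfind(".//item"):
        channel.append(item)

etree.ElementTree(root).write(FEED_PATH, encoding="utf-8", xml_declaration=True)

Because new items are appended ahead of the carried-over ones, the feed stays newest-first as long as the scrape returns articles in that order.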