|
@@ -0,0 +1,160 @@
|
|
|
+#!/usr/bin/env python3.7
|
|
|
+
|
|
|
+import requests
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+from lxml import etree
|
|
|
+from sys import argv
|
|
|
+from datetime import datetime
|
|
|
+import html
|
|
|
+import time
|
|
|
+
|
|
|
+
|
|
|
# Command-line handling: exactly three positional arguments are required
# (login e-mail address, password and output file path).
if len(argv) != 4:
    # f-string so the program name is interpolated rather than printed as
    # the literal text "{argv[0]}".
    print(f"Usage: {argv[0]} USERNAME PASSWORD OUTFILE")
    # Raise SystemExit directly: `quit` is only injected by the `site`
    # module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)

emailaddr, password, outfile = argv[1:]
|
|
|
+
|
|
|
class Article:
    """One article, built from a publication card of the overview page.

    ``card`` is the parsed card element. The full article page is downloaded
    once, at construction time, and kept in ``full``.

    Class attributes:
        headers: HTTP headers sent with every article request. Expected to
            be assigned by the caller before the first instance is created.
        cookies: Cookies sent with every article request (same remark).
        max_retries: How many times a non-200 response is retried before
            giving up; ``None`` retries without limit.
        retry_delay: Seconds to sleep between two attempts.
    """

    headers = None
    cookies = None
    max_retries = 30
    retry_delay = 10

    def __init__(self, card):
        self.card = card
        # Performs network I/O: the article page is fetched immediately.
        self.full = self.get_full_article()

    @property
    def title(self) -> str:
        """Text of the card's title link, stripped and HTML-unescaped."""
        # NOTE(review): the parser normally decodes entities already;
        # presumably the site double-escapes titles -- confirm.
        anchor = self.card.find('a', class_='publication-card__title')
        return html.unescape(anchor.text.strip())

    @property
    def author(self) -> str:
        """Author names from the card, joined with ", "."""
        authors = self.card.find('div', class_='publication-card__names').find_all('div')
        return ", ".join(author.text.strip() for author in authors)

    @property
    def link(self) -> str:
        """The ``href`` of the card's title link."""
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self) -> str:
        """The ``datetime`` attribute of the first ``<time>`` element of the page."""
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self) -> str:
        """``date`` reformatted as e.g. ``Tue, 01 May 2018 06:00:00 +0200``.

        Raises:
            ValueError: if ``date`` does not match ``%Y-%m-%dT%H:%M:%S%z``.
        """
        parsed = datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z")
        return parsed.strftime("%a, %d %b %Y %H:%M:%S %z")

    @property
    def summary(self):
        """Stripped text of the ``article-lead`` div, or ``None`` if absent."""
        summary = self.full.find('div', class_='article-lead')
        if summary is None:
            return None
        return summary.text.strip()

    def get_full_article(self):
        """Download the page behind ``link`` and return it parsed.

        Every response other than HTTP 200 is retried after ``retry_delay``
        seconds. The number of retries is capped by ``max_retries`` so that a
        permanently failing URL cannot keep the script looping forever.

        Raises:
            RuntimeError: if no HTTP 200 response was obtained within
                ``max_retries`` retries.
        """
        url = self.link
        retries = 0
        r = requests.get(url, cookies=self.cookies, headers=self.headers)
        while r.status_code != 200:
            if self.max_retries is not None and retries >= self.max_retries:
                raise RuntimeError(
                    f"Giving up on {url}: HTTP {r.status_code} after {retries} retries"
                )
            retries += 1
            print("Waiting for rate-limiter...")
            time.sleep(self.retry_delay)
            r = requests.get(url, cookies=self.cookies, headers=self.headers)
        return BeautifulSoup(r.text, 'html.parser')
|
|
|
+
|
|
|
+
|
|
|
# Browser-like User-Agent sent with the login and the article requests.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"

# Log in with the credentials given on the command line.
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": USER_AGENT,
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
    stream=True,
)

# The raw Set-Cookie header is split by hand on "; ".
# NOTE(review): the key "secure, session" suggests several cookies arrive
# joined in one header value, the session cookie right after a "secure"
# attribute -- confirm against a real response.
cookies = response.raw.headers["Set-Cookie"]
# The body is never read; close the streamed response so its connection is
# released instead of staying checked out.
response.close()

cookies_list = cookies.split("; ")
cookies_dict = {}
for item in cookies_list:
    # partition() splits on the first "=" only, so values that themselves
    # contain "=" are kept whole instead of being truncated.
    name, separator, value = item.partition("=")
    if separator:
        cookies_dict[name] = value

cookies_reuse = {
    "session": cookies_dict["secure, session"],
    "cookies-cleaned": cookies_dict["cookies-cleaned"],
}

# Every article request reuses the same User-Agent and session cookies.
Article.headers = {"User-Agent": USER_AGENT}
Article.cookies = dict(cookies_reuse)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
# Fetch the overview page listing the most recent publications.
req = requests.get(
    "https://decorrespondent.nl/recent",
    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
    cookies=cookies_reuse,
)
soup = BeautifulSoup(req.text, 'html.parser')

# Lazy on purpose: each Article downloads its own page when it is created,
# i.e. only when the loop below pulls it from the generator. The loop variable
# is named `card` so it does not shadow the imported `html` module name.
articles = (
    Article(card)
    for card in soup.find_all('div', class_='publication-card__body')
)

feed = {
    "version": "https://jsonfeed.org/version/1",
    "title": "De Correspondent",
    "home_page_url": "https://decorrespondent.nl",
    "feed_url": "https://finetuned.nl/decorrespondent.json",
    "items": [],
}

for article in articles:
    lead_text = article.summary
    feed["items"].append({
        # The article URL serves as identifier; the original stored the
        # builtin function `id` here by mistake.
        "id": article.link,
        "title": article.title,
        "link": article.link,
        "content_text": lead_text,
        "summary": lead_text,
        "date_published": article.date,
        "author": article.author,
        "rfc822_date": article.rfc_822_date,
    })
|
|
|
+
|
|
|
# Construct XML feed

def _add_text_child(parent, tag, text):
    """Append a new ``tag`` element holding ``text`` to ``parent``; return it."""
    child = etree.SubElement(parent, tag)
    child.text = text
    return child


root = etree.Element("rss", version="2.0")
channel = etree.SubElement(root, "channel")

# Channel-level metadata, in the order title, link, description.
_add_text_child(channel, 'title', "De Correspondent")
_add_text_child(channel, 'link', "https://decorrespondent.nl")
_add_text_child(channel, 'description', "Een dagelijks medicijn tegen de waan van de dag")

# (RSS element name, key in the feed item) in output order; the article link
# is emitted twice, once as <link> and once as <guid>.
_ITEM_FIELDS = (
    ('title', 'title'),
    ('description', 'summary'),
    ('link', 'link'),
    ('guid', 'link'),
    ('pubDate', 'rfc822_date'),
    ('author', 'author'),
)

for entry in feed['items']:
    item = etree.SubElement(channel, 'item')
    for tag_name, key in _ITEM_FIELDS:
        _add_text_child(item, tag_name, entry[key])

tree = etree.ElementTree(root)

# Binary mode: lxml encodes the document itself (UTF-8 with XML declaration).
with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')
|
|
|
+
|
|
|
+
|
|
|
+
|