#!/usr/local/bin/python3.7
import argparse
import os.path
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from lxml import etree

# The same User-Agent is sent with every request, so define it once.
USER_AGENT = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
              "AppleWebKit/605.1.15 (KHTML, like Gecko) "
              "Version/11.1 Safari/605.1.15")

parser = argparse.ArgumentParser()
parser.add_argument("username", type=str,
                    help="Your username (e-mail address) for De Correspondent")
parser.add_argument("password", type=str,
                    help="Your password for De Correspondent")
parser.add_argument("outfile", type=str,
                    help="Full path, including filename, where the XML should be written")
args = parser.parse_args()

emailaddr = args.username
password = args.password
outfile = args.outfile


class Article:
    """One publication card from the overview page. The full article is
    fetched lazily, because every extra request counts against the
    site's rate-limiter."""

    # headers and cookies are attached to the class after login, below.

    def __init__(self, card):
        self.card = card
        self.full = None

    @property
    def title(self):
        return self.card.find('a', class_='publication-card__title').text.strip()

    @property
    def author(self):
        authors = self.card.find('div', class_='publication-card__names').find_all('div')
        return ", ".join([author.text.strip() for author in authors])

    @property
    def link(self):
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self):
        if not self.full:
            self.get_full_article()
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self):
        # RSS 2.0 requires RFC 822 dates; the site serves ISO 8601.
        return (datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z")
                .strftime("%a, %d %b %Y %H:%M:%S %z"))

    @property
    def summary(self):
        if not self.full:
            self.get_full_article()
        summary = self.full.find('div', class_='article-lead')
        if not summary:
            return None
        return summary.text.strip()

    def get_full_article(self):
        # Retry until the rate-limiter lets the request through.
        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        while r.status_code != 200:
            print("Waiting for rate-limiter...")
            time.sleep(10)
            r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        self.full = BeautifulSoup(r.text, 'html.parser')


# Load the existing file and gather the known URLs, so articles from a
# previous run are not fetched and dated again.
existing_xml = None
known_links = []
if os.path.isfile(outfile):
    try:
        existing_xml = etree.parse(outfile)
        known_links = [link.text for link in existing_xml.iterfind('.//link')]
    except etree.XMLSyntaxError:
        pass  # Corrupt or empty file: rebuild the feed from scratch.

# Log in to the website
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": USER_AGENT,
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
)

# requests already parses the Set-Cookie headers into a cookie jar, so
# use that instead of splitting the raw header string by hand.
cookies_reuse = {
    "session": response.cookies["session"],
    "cookies-cleaned": response.cookies["cookies-cleaned"],
}

Article.headers = {"User-Agent": USER_AGENT}
Article.cookies = cookies_reuse

# Get the page that contains the latest articles
req = requests.get("https://decorrespondent.nl/recent",
                   headers={"User-Agent": USER_AGENT},
                   cookies=cookies_reuse)
soup = BeautifulSoup(req.text, 'html.parser')
articles = (Article(html) for html in
            soup.find_all('div', class_='publication-card__body'))
# Construct the XML feed
root = etree.Element("rss", version="2.0")
channel = etree.Element("channel")

title = etree.Element('title')
title.text = "De Correspondent"
channel.append(title)

link = etree.Element('link')
link.text = "https://decorrespondent.nl"
channel.append(link)

description = etree.Element('description')
description.text = "Een dagelijks medicijn tegen de waan van de dag"
channel.append(description)

for article in articles:
    if article.link in known_links:
        continue  # Already in the feed from an earlier run.

    item = etree.Element('item')

    title = etree.Element('title')
    title.text = article.title
    item.append(title)

    description = etree.Element('description')
    description.text = article.summary
    item.append(description)

    link = etree.Element('link')
    link.text = article.link
    item.append(link)

    guid = etree.Element('guid')
    guid.text = article.link
    item.append(guid)

    pubDate = etree.Element('pubDate')
    pubDate.text = article.rfc_822_date
    item.append(pubDate)

    author = etree.Element('author')
    author.text = article.author
    item.append(author)

    channel.append(item)

# Add previously loaded articles (if any)
if existing_xml is not None:
    for item in existing_xml.iterfind('.//item'):
        channel.append(item)

root.append(channel)
tree = etree.ElementTree(root)
with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')
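
# A minimal usage sketch. The script name and paths below are
# placeholders, not part of the original:
#
#   ./decorrespondent_rss.py user@example.com 'wachtwoord' /var/www/feeds/decorrespondent.xml
#
# Scheduling it with cron keeps the feed current, e.g. once per hour:
#
#   0 * * * * /path/to/decorrespondent_rss.py user@example.com 'wachtwoord' /var/www/feeds/decorrespondent.xml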