#!/usr/local/bin/python3.7
import requests
from bs4 import BeautifulSoup
from lxml import etree
from datetime import datetime
import time
import argparse
import os.path

parser = argparse.ArgumentParser()
parser.add_argument("username", type=str, help="Your username (email address) for De Correspondent")
parser.add_argument("password", type=str, help="Your password for De Correspondent")
parser.add_argument("outfile", type=str, help="Full path, including filename, where the XML feed should be written")
args = parser.parse_args()
emailaddr = args.username
password = args.password
outfile = args.outfile
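
# Example invocation (the script name is illustrative):
#   ./correspondent_rss.py user@example.com 's3cret-password' /var/www/feeds/decorrespondent.xml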


class Article:
    """One publication card from the /recent overview page.

    The full article page is only fetched (lazily) when the date or summary
    is first requested. The shared ``headers`` and ``cookies`` used for those
    requests are attached to the class after logging in, further down in the
    script.
    """

    def __init__(self, card):
        self.card = card
        self.full = None

    @property
    def title(self):
        return self.card.find('a', class_='publication-card__title').text.strip()

    @property
    def author(self):
        authors = self.card.find('div', class_='publication-card__names').find_all('div')
        return ", ".join([author.text.strip() for author in authors])

    @property
    def link(self):
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self):
        if not self.full:
            self.get_full_article()
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self):
        # RSS expects RFC 822 dates; the site serves ISO 8601 timestamps.
        return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")

    @property
    def summary(self):
        if not self.full:
            self.get_full_article()
        summary = self.full.find('div', class_='article-lead')
        if not summary:
            return None
        return summary.text.strip()

    def get_full_article(self):
        # Retry every 10 seconds for as long as the rate-limiter returns a non-200 response.
        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        while r.status_code != 200:
            print("Waiting for rate-limiter...")
            time.sleep(10)
            r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        self.full = BeautifulSoup(r.text, 'html.parser')


# Load the existing feed file (if any) and gather the already-known article URLs
existing_xml = None
known_links = []
if os.path.isfile(outfile):
    try:
        existing_xml = etree.parse(outfile)
        known_links = [link.text for link in existing_xml.iterfind('.//link')]
    except etree.XMLSyntaxError:
        # A corrupt or empty feed file is simply rebuilt from scratch
        pass

# Log in to the website
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15",
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
    stream=True,
)

# urllib3 folds the multiple Set-Cookie headers into one comma-separated string,
# which is why the session cookie ends up under the odd "secure, session" key below.
cookies = response.raw.headers["Set-Cookie"]
cookies_list = cookies.split("; ")
cookies_dict = {item.split("=")[0]: item.split("=")[1] for item in cookies_list if "=" in item}
cookies_reuse = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"}
Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}

# Get the page that contains the latest articles
req = requests.get("https://decorrespondent.nl/recent",
                   headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
                   cookies=cookies_reuse)
soup = BeautifulSoup(req.text, 'html.parser')
articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))

# Construct XML feed
root = etree.Element("rss", version="2.0")
channel = etree.Element("channel")
title = etree.Element('title')
title.text = "De Correspondent"
channel.append(title)
link = etree.Element('link')
link.text = "https://decorrespondent.nl"
channel.append(link)
description = etree.Element('description')
description.text = "Een dagelijks medicijn tegen de waan van de dag"
channel.append(description)

# Add an <item> for every article that is not already in the existing feed
for article in articles:
    if article.link in known_links:
        continue
    item = etree.Element('item')
    title = etree.Element('title')
    title.text = article.title
    item.append(title)
    description = etree.Element('description')
    description.text = article.summary
    item.append(description)
    link = etree.Element('link')
    link.text = article.link
    item.append(link)
    guid = etree.Element('guid')
    guid.text = article.link
    item.append(guid)
    pubDate = etree.Element('pubDate')
    pubDate.text = article.rfc_822_date
    item.append(pubDate)
    author = etree.Element('author')
    author.text = article.author
    item.append(author)
    channel.append(item)

# Add previously loaded articles (if any)
if existing_xml is not None:
    for item in existing_xml.iterfind('.//item'):
        channel.append(item)

root.append(channel)
tree = etree.ElementTree(root)
with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')
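
# For reference, the resulting file has the usual RSS 2.0 shape; the item values
# below are illustrative, not real output:
#
#   <?xml version='1.0' encoding='utf-8'?>
#   <rss version="2.0">
#     <channel>
#       <title>De Correspondent</title>
#       <link>https://decorrespondent.nl</link>
#       <description>Een dagelijks medicijn tegen de waan van de dag</description>
#       <item>
#         <title>...</title>
#         <description>...</description>
#         <link>https://decorrespondent.nl/...</link>
#         <guid>https://decorrespondent.nl/...</guid>
#         <pubDate>Wed, 02 Jan 2019 06:00:00 +0100</pubDate>
#         <author>...</author>
#       </item>
#     </channel>
#   </rss>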