correspondent_rss.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/local/bin/python3.7
  2. import requests
  3. from bs4 import BeautifulSoup
  4. from lxml import etree
  5. from sys import argv
  6. from datetime import datetime
  7. import time
  8. import argparse
  9. import os.path
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument("username", type=str, help="Je gebruikersname (e-mailadres) voor de correspondent")
  12. parser.add_argument("password", type=str, help="Je wachtwoord voor de correspondent")
  13. parser.add_argument("outfile", type=str, help="Volledig pad met filename waar de XML moet worden gezet")
  14. args = parser.parse_args()
  15. emailaddr = args.username
  16. password = args.password
  17. outfile = args.outfile
  18. class Article:
  19. def __init__(self, card):
  20. self.card = card
  21. self.full = None
  22. @property
  23. def title(self):
  24. return self.card.find('a', class_='publication-card__title').text.strip()
  25. @property
  26. def author(self):
  27. authors = self.card.find('div', class_='publication-card__names').find_all('div')
  28. return ", ".join([author.text.strip() for author in authors])
  29. @property
  30. def link(self):
  31. return self.card.find('a', class_='publication-card__title')['href']
  32. @property
  33. def date(self):
  34. if not self.full:
  35. self.get_full_article()
  36. return self.full.find('time')['datetime']
  37. @property
  38. def rfc_822_date(self):
  39. return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")
  40. @property
  41. def summary(self):
  42. if not self.full:
  43. self.get_full_article()
  44. summary = self.full.find('div', class_='article-lead')
  45. if not summary:
  46. return None
  47. return summary.text.strip()
  48. def get_full_article(self):
  49. r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
  50. while r.status_code != 200:
  51. print("Waiting for rate-limiter...")
  52. time.sleep(10)
  53. r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
  54. self.full = BeautifulSoup(r.text, 'html.parser')
  55. # Load the existing file and gather the known URLS
  56. existing_xml = None
  57. known_links = []
  58. if os.path.isfile(outfile):
  59. try:
  60. existing_xml = etree.parse(outfile)
  61. known_links = [link.text for link in existing_xml.iterfind('.//link')]
  62. except:
  63. pass
  64. # Log in to the website
  65. response = requests.post(
  66. url="https://decorrespondent.nl/api2/account/password-authenticate",
  67. headers={
  68. "Content-Type": "application/json",
  69. "Origin": "https://decorrespondent.nl",
  70. "Pragma": "no-cache",
  71. "Accept": "application/json",
  72. "Cache-Control": "no-cache",
  73. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15",
  74. "Referer": "https://decorrespondent.nl/inloggen",
  75. "X-Requested-With": "XMLHttpRequest",
  76. },
  77. json={"emailAddress": emailaddr, "password": password},
  78. stream = True
  79. )
  80. cookies = response.raw.headers["Set-Cookie"]
  81. cookies_list = cookies.split("; ")
  82. cookies_dict = {item.split("=")[0]:item.split("=")[1] for item in cookies_list if "=" in item}
  83. cookies_reuse = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
  84. Article.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"}
  85. Article.cookies = {"session": cookies_dict["secure, session"], "cookies-cleaned": cookies_dict["cookies-cleaned"]}
  86. # Get the page that contains the latest articles
  87. req = requests.get("https://decorrespondent.nl/recent",
  88. headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"},
  89. cookies=cookies_reuse)
  90. soup = BeautifulSoup(req.text, 'html.parser')
  91. articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))
  92. # Construct XML feed
  93. root = etree.Element("rss", version="2.0")
  94. channel = etree.Element("channel")
  95. title = etree.Element('title')
  96. title.text = "De Correspondent"
  97. channel.append(title)
  98. link = etree.Element('link')
  99. link.text = "https://decorrespondent.nl"
  100. channel.append(link)
  101. description = etree.Element('description')
  102. description.text = "Een dagelijks medicijn tegen de waan van de dag"
  103. channel.append(description)
  104. for article in articles:
  105. if article.link in known_links:
  106. continue
  107. item = etree.Element('item')
  108. title = etree.Element('title')
  109. title.text = article.title
  110. item.append(title)
  111. description = etree.Element('description')
  112. description.text = article.summary
  113. item.append(description)
  114. link = etree.Element('link')
  115. link.text = article.link
  116. item.append(link)
  117. guid = etree.Element('guid')
  118. guid.text = article.link
  119. item.append(guid)
  120. pubDate = etree.Element('pubDate')
  121. pubDate.text = article.rfc_822_date
  122. item.append(pubDate)
  123. author = etree.Element('author')
  124. author.text = article.author
  125. item.append(author)
  126. channel.append(item)
  127. # Add previously loaded articles (if any)
  128. if existing_xml:
  129. for item in existing_xml.iterfind('.//item'):
  130. channel.append(item)
  131. root.append(channel)
  132. tree = etree.ElementTree(root)
  133. with open(outfile, "wb") as file:
  134. tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')