correspondent_rss.py

#!/usr/bin/env python3
import argparse
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from lxml import etree

# One browser-like User-Agent, reused for every request.
USER_AGENT = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
              "AppleWebKit/605.1.15 (KHTML, like Gecko) "
              "Version/11.1 Safari/605.1.15")

parser = argparse.ArgumentParser()
parser.add_argument("-u", "--username", required=True, type=str,
                    help="Your username (email address) for De Correspondent")
parser.add_argument("-p", "--password", required=True, type=str,
                    help="Your password for De Correspondent")
parser.add_argument("-o", "--outfile", required=True, type=str,
                    help="Full path, including filename, where the XML should be written")
args = parser.parse_args()
emailaddr = args.username
password = args.password
outfile = args.outfile
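
# Example invocation (hypothetical credentials and output path):
#   ./correspondent_rss.py -u you@example.com -p secret -o /var/www/decorrespondent.xml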
class Article:
    """One publication card from /recent plus its full article page."""

    # Filled in after login (below), before any Article is created.
    headers = None
    cookies = None

    def __init__(self, card):
        self.card = card
        self.full = self.get_full_article()

    @property
    def title(self):
        return self.card.find('a', class_='publication-card__title').text.strip()

    @property
    def author(self):
        authors = self.card.find('div', class_='publication-card__names').find_all('div')
        return ", ".join(author.text.strip() for author in authors)

    @property
    def link(self):
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self):
        # ISO 8601 timestamp from the article's <time> element.
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self):
        # RSS 2.0 wants RFC 822 dates, e.g. "Tue, 01 Jan 2019 09:00:00 +0100".
        return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")

    @property
    def summary(self):
        summary = self.full.find('div', class_='article-lead')
        if not summary:
            return None
        return summary.text.strip()

    def get_full_article(self):
        # Fetch the full article page, retrying until the rate-limiter relents.
        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        while r.status_code != 200:
            print("Waiting for rate-limiter...")
            time.sleep(10)
            r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        return BeautifulSoup(r.text, 'html.parser')
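
# Log in through the site's JSON authentication endpoint; the browser-like
# headers mirror what the login form at /inloggen sends.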
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": USER_AGENT,
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
)

# requests parses the Set-Cookie headers into a cookie jar; reading the two
# cookies from the jar is more robust than splitting the raw header string.
cookies_reuse = {
    "session": response.cookies["session"],
    "cookies-cleaned": response.cookies["cookies-cleaned"],
}

# Every Article instance fetches its full page with these headers and cookies.
Article.headers = {"User-Agent": USER_AGENT}
Article.cookies = cookies_reuse

req = requests.get("https://decorrespondent.nl/recent",
                   headers={"User-Agent": USER_AGENT},
                   cookies=cookies_reuse)
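
# Each publication on /recent is rendered as a "publication-card"; the card
# body carries the title, authors, and link that the Article class reads.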
soup = BeautifulSoup(req.text, 'html.parser')
articles = (Article(html) for html in soup.find_all('div', class_='publication-card__body'))

# Collect the articles in a JSON Feed-shaped dict; its "items" list drives
# the RSS document below.
feed = {"version": "https://jsonfeed.org/version/1",
        "title": "De Correspondent",
        "home_page_url": "https://decorrespondent.nl",
        "feed_url": "https://finetuned.nl/decorrespondent.json",
        "items": []}

for article in articles:
    feed["items"].append({"id": article.link,  # the URL doubles as a unique id
                          "title": article.title,
                          "link": article.link,
                          "content_text": article.summary,
                          "summary": article.summary,
                          "date_published": article.date,
                          "author": article.author,
                          "rfc822_date": article.rfc_822_date})
root = etree.Element("rss", version="2.0")
channel = etree.SubElement(root, "channel")
etree.SubElement(channel, "title").text = "De Correspondent"
etree.SubElement(channel, "link").text = "https://decorrespondent.nl"
# The site's Dutch tagline (roughly: "a daily antidote to the daily news frenzy").
etree.SubElement(channel, "description").text = "Een dagelijks medicijn tegen de waan van de dag"

for article in feed['items']:
    item = etree.SubElement(channel, 'item')
    etree.SubElement(item, 'title').text = article['title']
    etree.SubElement(item, 'description').text = article['summary']
    etree.SubElement(item, 'link').text = article['link']
    etree.SubElement(item, 'guid').text = article['link']
    etree.SubElement(item, 'pubDate').text = article['rfc822_date']
    etree.SubElement(item, 'author').text = article['author']

tree = etree.ElementTree(root)
with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')
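
# The written file is plain RSS 2.0, roughly (illustrative values):
#
#   <?xml version='1.0' encoding='utf-8'?>
#   <rss version="2.0">
#     <channel>
#       <title>De Correspondent</title>
#       <link>https://decorrespondent.nl</link>
#       <description>Een dagelijks medicijn tegen de waan van de dag</description>
#       <item>
#         <title>...</title>
#         <link>https://decorrespondent.nl/...</link>
#         <guid>https://decorrespondent.nl/...</guid>
#         <pubDate>Tue, 01 Jan 2019 09:00:00 +0100</pubDate>
#       </item>
#     </channel>
#   </rss>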