correspondent_rss.py

#!/usr/bin/env python3.7
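"""Scrape the most recent articles from decorrespondent.nl (a subscriber
login is required) and write them out as an RSS 2.0 feed.

Usage: correspondent_rss.py USERNAME PASSWORD OUTFILE
"""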
import requests
from bs4 import BeautifulSoup
from lxml import etree
from sys import argv
from datetime import datetime
import html
import time

if len(argv) != 4:
    print(f"Usage: {argv[0]} USERNAME PASSWORD OUTFILE")
    raise SystemExit(1)

emailaddr = argv[1]
password = argv[2]
outfile = argv[3]

# Every request sends the same browser User-Agent string.
USER_AGENT = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
              "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15")
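

# An Article wraps one "publication card" from the /recent overview page. The
# full article page is fetched on construction, since the publication date and
# the lead paragraph are taken from it. Article.cookies and Article.headers
# are class attributes assigned after login, further down.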
class Article:
    def __init__(self, card):
        self.card = card
        self.full = self.get_full_article()

    @property
    def title(self):
        return html.unescape(self.card.find('a', class_='publication-card__title').text.strip())

    @property
    def author(self):
        authors = self.card.find('div', class_='publication-card__names').find_all('div')
        return ", ".join(author.text.strip() for author in authors)

    @property
    def link(self):
        return self.card.find('a', class_='publication-card__title')['href']

    @property
    def date(self):
        return self.full.find('time')['datetime']

    @property
    def rfc_822_date(self):
        # RSS 2.0 wants RFC 822 dates; the site serves ISO 8601 timestamps.
        return datetime.strptime(self.date, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")

    @property
    def summary(self):
        summary = self.full.find('div', class_='article-lead')
        if not summary:
            return None
        return summary.text.strip()

    def get_full_article(self):
        r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        # Anything other than 200 is assumed to be the rate-limiter; back off
        # and retry until the page comes through.
        while r.status_code != 200:
            print("Waiting for rate-limiter...")
            time.sleep(10)
            r = requests.get(self.link, cookies=self.cookies, headers=self.headers)
        return BeautifulSoup(r.text, 'html.parser')
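
# Authenticate against the site's JSON login API; the browser-like headers
# mimic a real login request from the website itself.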
response = requests.post(
    url="https://decorrespondent.nl/api2/account/password-authenticate",
    headers={
        "Content-Type": "application/json",
        "Origin": "https://decorrespondent.nl",
        "Pragma": "no-cache",
        "Accept": "application/json",
        "Cache-Control": "no-cache",
        "User-Agent": USER_AGENT,
        "Referer": "https://decorrespondent.nl/inloggen",
        "X-Requested-With": "XMLHttpRequest",
    },
    json={"emailAddress": emailaddr, "password": password},
)
# requests has already parsed the Set-Cookie headers into a cookie jar; only
# the session cookie and the "cookies-cleaned" flag need to be replayed on
# later requests.
cookies_reuse = {
    "session": response.cookies["session"],
    "cookies-cleaned": response.cookies["cookies-cleaned"],
}
Article.headers = {"User-Agent": USER_AGENT}
Article.cookies = cookies_reuse
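
# Fetch the overview of recent publications, reusing the session cookie.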
req = requests.get("https://decorrespondent.nl/recent",
                   headers={"User-Agent": USER_AGENT},
                   cookies=cookies_reuse)
soup = BeautifulSoup(req.text, 'html.parser')
articles = (Article(card) for card in soup.find_all('div', class_='publication-card__body'))
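
# Collect everything in a JSON-Feed-style dict first; the RSS document below
# is generated from its "items" list. ("rfc822_date" is not a JSON Feed field;
# it is carried along for the RSS pubDate element.)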
feed = {"version": "https://jsonfeed.org/version/1",
        "title": "De Correspondent",
        "home_page_url": "https://decorrespondent.nl",
        "feed_url": "https://finetuned.nl/decorrespondent.json",
        "items": []}

for article in articles:
    feed["items"].append({"id": article.link,  # the permalink doubles as a stable id
                          "title": article.title,
                          "link": article.link,
                          "content_text": article.summary,
                          "summary": article.summary,
                          "date_published": article.date,
                          "author": article.author,
                          "rfc822_date": article.rfc_822_date})

# Construct the RSS 2.0 document from the collected items.
root = etree.Element("rss", version="2.0")
channel = etree.SubElement(root, "channel")
etree.SubElement(channel, "title").text = "De Correspondent"
etree.SubElement(channel, "link").text = "https://decorrespondent.nl"
# The site's Dutch tagline.
etree.SubElement(channel, "description").text = "Een dagelijks medicijn tegen de waan van de dag"

for article in feed['items']:
    item = etree.SubElement(channel, "item")
    etree.SubElement(item, "title").text = article['title']
    etree.SubElement(item, "description").text = article['summary']
    etree.SubElement(item, "link").text = article['link']
    etree.SubElement(item, "guid").text = article['link']
    etree.SubElement(item, "pubDate").text = article['rfc822_date']
    etree.SubElement(item, "author").text = article['author']

tree = etree.ElementTree(root)
with open(outfile, "wb") as file:
    tree.write(file, pretty_print=True, xml_declaration=True, encoding='utf-8')