Initial commit

Added the first implementation of scraping the website. The "thumbnail"
option is disabled in the embeds that are sent because Guilded does not
handle it properly yet (cropping is wonky); a rough sketch of the embed
payload is included below. The script should also work with any Discord
webhook URL, but that has not been tested. There is currently no way to
pre-populate the database with hashes other than doing it manually
beforehand; adding that is something I am looking at for the future.
Author: 雲華
Date: 2021-08-17 17:33:52 -04:00
commit 12bd0027f9
5 changed files with 303 additions and 0 deletions
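As context for the commit message above, here is a minimal sketch of how an article dict produced by NewsScraper.news_articles() might be posted to a Guilded (or Discord) webhook with the thumbnail left out. The webhook URL, the post_article helper, and the exact embed field choices are illustrative assumptions, not part of this commit.

# Hypothetical posting helper, not part of this commit. Both Guilded and
# Discord webhooks accept a Discord-style {"embeds": [...]} JSON payload.
from aiohttp import ClientSession

WEBHOOK_URL = 'https://example.invalid/webhooks/placeholder'  # placeholder URL

async def post_article(session: ClientSession, article: dict):
    embed = {
        'title': article['title'],
        'description': article['preview_text'],
        'url': article['article_link'],
        'image': {'url': article['image_preview']},
        # 'thumbnail': {'url': article['image_preview']},  # disabled: Guilded crops thumbnails oddly
        'footer': {'text': article['tag']},
    }
    async with session.post(WEBHOOK_URL, json={'embeds': [embed]}) as resp:
        resp.raise_for_status()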

news.py (new file, 83 lines)

@@ -0,0 +1,83 @@
from lxml.html import fromstring
from aiohttp import ClientSession
from sqlite3 import Error
import hashlib
import sqlite3

BASE_URL = 'https://www.playlostark.com'


def _create_connection(db_file):
    # Open (or create) the SQLite database and make sure the hash table exists.
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute('CREATE TABLE IF NOT EXISTS news_hashes(hash TEXT NOT NULL);')
        conn.commit()
    except Error as e:
        print(e)
    return conn


class NewsScraper:
    def __init__(self, loop=None, client=None, database=None):
        if loop is None and client is None:
            raise ValueError('NewsScraper requires either an event loop or an existing ClientSession.')
        # Reuse the caller's session when one is given; otherwise create our own.
        self.client = client if client is not None else ClientSession(loop=loop)
        self.database = _create_connection('news.db') if database is None else database
        self._md5 = hashlib.new('md5', usedforsecurity=False)

    async def _fetch_url(self, url):
        async with self.client.get(url=url) as resp:
            return await resp.text()

    def _store_hash(self, _hash, table):
        with self.database as db:
            cur = db.cursor()
            cur.execute(f'INSERT INTO {table} VALUES(?);', (_hash,))
            db.commit()

    def _check_hash(self, _hash, table):
        with self.database as db:
            cur = db.cursor()
            cur.execute(f'SELECT * FROM {table} WHERE hash=?;', (_hash,))
            result = cur.fetchone()
            return result

    async def news_articles(self):
        """
        Scrape the news listing page and return articles that have not been seen before.

        XPath: //div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]
        :return: list of article metadata dicts, oldest first
        """
        url = BASE_URL + '/en-us/news'
        raw_html = await self._fetch_url(url)
        doc = fromstring(raw_html)
        article_box = doc.xpath('//div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]')
        articles = []
        for article in article_box:
            article_meta = {
                'title': str(article.xpath('.//span[@class="ags-SlotModule-contentContainer-heading '
                                           'ags-SlotModule-contentContainer-heading '
                                           'ags-SlotModule-contentContainer-heading--blog"]/text()')[0]).strip(),
                'preview_text': str(article.xpath('.//div[@class="ags-SlotModule-contentContainer-text '
                                                  'ags-SlotModule-contentContainer-text--blog '
                                                  'ags-SlotModule-contentContainer-text"]/text()')[0]).strip(),
                'article_link': BASE_URL + article.xpath('.//a[@class="ags-SlotModule-spacer"]/@href')[0].strip(),
                'image_preview': 'https:' + article.xpath('.//img[@class="ags-SlotModule-imageContainer-image"]/@src')[0],
                'tag': str(article.xpath('.//h4[@class="ags-SlotModule-aboveImageBlogTag"]/text()')[0]).strip(),
            }
            # Hash the metadata so already-posted articles can be skipped on later runs.
            _hash = hashlib.md5(str(article_meta).encode('UTF-8'), usedforsecurity=False).hexdigest()
            if self._check_hash(_hash, 'news_hashes'):
                # Articles are listed newest first, so the first known hash means
                # everything after it has already been seen.
                break
            self._store_hash(_hash, 'news_hashes')
            articles.append(article_meta)
        # list.reverse() returns None, so reverse in place and return the list (oldest first).
        articles.reverse()
        return articles

    async def close(self):
        await self.client.close()
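Two usage sketches follow, assuming the file above is saved as news.py: driving the scraper once, and manually pre-seeding news.db with hashes (which, as the commit message notes, currently has to be done by hand). The run_once name and the hash value are placeholders, not part of this commit.

# Hypothetical one-shot driver, not part of this commit.
import asyncio
from news import NewsScraper

async def run_once():
    scraper = NewsScraper(loop=asyncio.get_event_loop())
    try:
        for article in await scraper.news_articles():
            print(article['title'], article['article_link'])
    finally:
        await scraper.close()

asyncio.run(run_once())

# Manually pre-seeding news.db so already-published articles are skipped on
# the first run (the hash shown is a placeholder value).
import sqlite3

conn = sqlite3.connect('news.db')
conn.execute('CREATE TABLE IF NOT EXISTS news_hashes(hash TEXT NOT NULL);')
conn.execute('INSERT INTO news_hashes VALUES(?);', ('d41d8cd98f00b204e9800998ecf8427e',))
conn.commit()
conn.close()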