commit 12bd0027f9d30637b2ea8fe77a282656cae61230
Author: 雲華 <42814579+yunwah@users.noreply.github.com>
Date:   Tue Aug 17 17:33:52 2021 -0400

    Initial commit

    Added the first implementation of scraping the website. The "thumbnail"
    option is disabled in the embeds that are sent because Guilded does not
    handle thumbnails properly yet (the cropping is wonky). This script should
    also work for any Discord webhook URL, but that hasn't been tested.

    There is currently no way to pre-populate the database with hashes other
    than doing it manually beforehand. This is something I'm looking at adding
    in the future.
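Because news.py records a hash for every article it parses, one way to seed news.db by hand is simply to run the scraper once and discard the result. A minimal sketch, assuming a hypothetical prepopulate.py alongside the NewsScraper class from this commit:

# prepopulate.py -- seed news.db with the articles currently on the news page
# so the first scheduled run does not re-post everything already published.
# Untested sketch; it only reuses NewsScraper from this commit.
import asyncio

from news import NewsScraper

loop = asyncio.get_event_loop()


async def seed():
    scraper = NewsScraper(loop=loop)
    await scraper.news_articles()  # hashes are written to news.db as a side effect
    await scraper.close()


loop.run_until_complete(seed())

Running something like this once before starting main.py would mean the first scheduled run only posts articles published after the seeding.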
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a604032
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,144 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# Database
+*.db
+
+# Pycharm
+.idea/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..088fa53
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:slim
+
+COPY . /lanews
+
+WORKDIR /lanews
+
+RUN python3 -m venv venv && \
+    venv/bin/pip install -r requirements.txt
+
+CMD ["venv/bin/python", "main.py"]
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2c7659e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,62 @@
+from datetime import datetime as dt
+from dotenv import load_dotenv
+from news import NewsScraper
+from os.path import join, dirname
+import schedule
+import os
+import asyncio
+import json
+import time
+
+dotenv_path = join(dirname(__file__), '.env')
+load_dotenv(dotenv_path)
+
+WEBHOOK_URL = os.environ.get('WEBHOOK_URL')
+loop = asyncio.get_event_loop()
+
+
+async def publish_news():
+    print('Running web scrape...')
+    la_news = NewsScraper(loop=loop)
+    articles = await la_news.news_articles()
+
+    if bool(articles):
+        for article in articles:
+            payload = {
+                "content": None,
+                "embeds": [
+                    {
+                        "title": article['title'].replace("'", "\\'"),
+                        "description": article['preview_text'].replace("'", "\\'"),
+                        "url": article['article_link'].replace("'", "\\'"),
+                        "color": 5814783,
+                        "footer": {
+                            "text": article['tag']
+                        },
+                        "timestamp": f"{dt.utcnow()}",
+                        "image": {
+                            "url": article['image_preview']
+                        },
+                        #"thumbnail": {
+                        #    "url": "https://images.ctfassets.net/umhrp0op95v1/S3yKwaVAOi8Bgqg4n4scf"
+                        #    "/adae769671b271b88f97d31721432986/LA_LOGO.png "
+                        #}
+                    }
+                ]
+            }
+            resp = await la_news.client.post(url=WEBHOOK_URL, data=json.dumps(payload).encode('UTF-8'), headers={'Content-Type': 'application/json'})
+            print(resp.status)
+    await la_news.close()
+
+
+def run_async(coroutine):
+    task = coroutine()
+    loop.run_until_complete(task)
+
+
+schedule.every().hour.do(run_async, publish_news)
+
+while True:
+    print('Checking schedule...')
+    schedule.run_pending()
+    time.sleep(300)
diff --git a/news.py b/news.py
new file mode 100644
index 0000000..dbf5816
--- /dev/null
+++ b/news.py
@@ -0,0 +1,83 @@
+from lxml.html import fromstring
+from aiohttp import ClientSession
+from sqlite3 import Error
+import hashlib
+import sqlite3
+
+BASE_URL = 'https://www.playlostark.com'
+
+
+def _create_connection(db_file):
+    conn = None
+    try:
+        conn = sqlite3.connect(db_file)
+        conn.execute('CREATE TABLE IF NOT EXISTS news_hashes(hash TEXT NOT NULL);')
+        conn.commit()
+    except Error as e:
+        print(e)
+    return conn
+
+
+class NewsScraper:
+    def __init__(self, loop=None, client=None, database=None):
+        if loop is None and client is None:
+            raise ValueError('NewsScraper needs either an event loop or an existing ClientSession')
+
+        self.client = client if client is not None else ClientSession(loop=loop)
+        self.database = _create_connection('news.db') if database is None else database
+        self._md5 = hashlib.new('md5', usedforsecurity=False)
+
+    async def _fetch_url(self, url):
+        async with self.client.get(url=url) as resp:
+            return await resp.text()
+
+    def _store_hash(self, _hash, table):
+        with self.database as db:
+            cur = db.cursor()
+            cur.execute(f'INSERT INTO {table} VALUES(?);', (_hash,))
+            db.commit()
+
+    def _check_hash(self, _hash, table):
+        with self.database as db:
+            cur = db.cursor()
+            cur.execute(f'SELECT * FROM {table} WHERE hash=?;', (_hash,))
+            result = cur.fetchone()
+            return result
+
+    async def news_articles(self):
+        """
+        XPath: //div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]
+        :return: list of not-yet-seen article dicts, oldest first
+        """
+        url = BASE_URL + '/en-us/news'
+
+        raw_html = await self._fetch_url(url)
+        doc = fromstring(raw_html)
+        article_box = doc.xpath('//div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]')
+
+        articles = []
+        for article in article_box:
+            article_meta = {
+                'title': str(article.xpath('.//span[@class="ags-SlotModule-contentContainer-heading '
+                                           'ags-SlotModule-contentContainer-heading '
+                                           'ags-SlotModule-contentContainer-heading--blog"]/text()')[0]).strip(),
+                'preview_text': str(article.xpath('.//div[@class="ags-SlotModule-contentContainer-text '
+                                                  'ags-SlotModule-contentContainer-text--blog '
+                                                  'ags-SlotModule-contentContainer-text"]/text()')[0]).strip(),
+                'article_link': BASE_URL + article.xpath('.//a[@class="ags-SlotModule-spacer"]/@href')[0].strip(),
+                'image_preview': 'https:' + article.xpath(
+                    './/img[@class="ags-SlotModule-imageContainer-image"]/@src')[0],
+                'tag': str(article.xpath('.//h4[@class="ags-SlotModule-aboveImageBlogTag"]/text()')[0]).strip(),
+            }
+
+            _hash = hashlib.md5(article_meta.__str__().encode('UTF-8'), usedforsecurity=False).hexdigest()
+            if self._check_hash(_hash, 'news_hashes'):
+                break  # this article and everything after it was already posted on a previous run
+            else:
+                self._store_hash(_hash, 'news_hashes')
+
+            articles.append(article_meta)
+        return list(reversed(articles))  # oldest first, so newer articles are posted last
+
+    async def close(self):
+        await self.client.close()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6de5e4e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+python-dotenv~=0.19.0
+lxml~=4.6.2
+aiohttp~=3.7.3
+schedule~=1.1.0
\ No newline at end of file
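The commit message notes that Discord webhook URLs are untested. A quick way to sanity-check a webhook outside the scraper is to post a single embed with the same headers and payload shape main.py uses. A minimal sketch, assuming a hypothetical webhook_test.py and a WEBHOOK_URL entry in .env:

# webhook_test.py -- post one test embed to WEBHOOK_URL and print the HTTP status.
# Untested sketch; the payload shape and headers mirror main.py.
import asyncio
import json
import os

from aiohttp import ClientSession
from dotenv import load_dotenv

load_dotenv()
WEBHOOK_URL = os.environ['WEBHOOK_URL']


async def send_test_embed():
    payload = {
        "content": None,
        "embeds": [
            {
                "title": "Webhook test",
                "description": "If this shows up in the channel, the webhook URL works.",
                "color": 5814783,
            }
        ],
    }
    async with ClientSession() as session:
        async with session.post(
            WEBHOOK_URL,
            data=json.dumps(payload).encode('UTF-8'),
            headers={'Content-Type': 'application/json'},
        ) as resp:
            print(resp.status)  # expect a 2xx status on success


asyncio.get_event_loop().run_until_complete(send_test_embed())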