Initial commit

Added the first implementation of the website scraper. The "thumbnail"
option is disabled in the embeds that are sent because Guilded does not
handle it properly yet (cropping is wonky). The script should also work
with any Discord webhook URL, but that hasn't been tested. There is
currently no way to pre-populate the database with hashes other than
doing it manually beforehand; that is something I'm looking at adding in
the future.
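
For reference, manual pre-population could look something like the hypothetical seed_hashes.py sketched below (not part of this commit). It assumes the scheme used in news.py: an MD5 digest of str(article_meta) stored in the news_hashes table of news.db.

# seed_hashes.py (hypothetical helper, not included in this commit)
# Inserts hashes for articles that are already published so the bot
# does not re-post them on its first run.
import hashlib
import sqlite3


def seed(db_file, article_metas):
    """article_metas: dicts shaped like the ones NewsScraper builds."""
    conn = sqlite3.connect(db_file)
    conn.execute('CREATE TABLE IF NOT EXISTS news_hashes(hash TEXT NOT NULL);')
    for meta in article_metas:
        digest = hashlib.md5(str(meta).encode('UTF-8')).hexdigest()
        conn.execute('INSERT INTO news_hashes VALUES(?);', (digest,))
    conn.commit()
    conn.close()


# Usage (hypothetical): seed('news.db', scraped_article_metas)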
雲華
2021-08-17 17:33:52 -04:00
commit 12bd0027f9
5 changed files with 303 additions and 0 deletions

.gitignore (vendored, 144 additions)

@@ -0,0 +1,144 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Database
*.db
# Pycharm
.idea/

Dockerfile (10 additions)

@@ -0,0 +1,10 @@
FROM python:slim
COPY . /lanews
WORKDIR /lanews
RUN python3 -m venv venv && \
venv/bin/pip install -r requirements.txt
CMD ["venv/bin/python", "main.py"]

main.py (62 additions)

@@ -0,0 +1,62 @@
from datetime import datetime as dt
from dotenv import load_dotenv
from news import NewsScraper
from os.path import join, dirname
import schedule
import os
import asyncio
import json
import time

# Load the webhook URL from the .env file next to this script.
dotenv_path = join(dirname(__file__), '.env')
load_dotenv(dotenv_path)
WEBHOOK_URL = os.environ.get('WEBHOOK_URL')

loop = asyncio.get_event_loop()


async def publish_news():
    print('Running web scrape...')
    la_news = NewsScraper(loop=loop)
    articles = await la_news.news_articles()
    if articles:
        for article in articles:
            payload = {
                "content": None,
                "embeds": [
                    {
                        # json.dumps handles escaping, so the scraped strings
                        # can be used as-is.
                        "title": article['title'],
                        "description": article['preview_text'],
                        "url": article['article_link'],
                        "color": 5814783,
                        "footer": {
                            "text": article['tag']
                        },
                        # Embed timestamps must be ISO 8601.
                        "timestamp": dt.utcnow().isoformat(),
                        "image": {
                            "url": article['image_preview']
                        },
                        #"thumbnail": {
                        #    "url": "https://images.ctfassets.net/umhrp0op95v1/S3yKwaVAOi8Bgqg4n4scf"
                        #           "/adae769671b271b88f97d31721432986/LA_LOGO.png "
                        #}
                    }
                ]
            }
            resp = await la_news.client.post(
                url=WEBHOOK_URL,
                data=json.dumps(payload).encode('UTF-8'),
                headers={'Content-Type': 'application/json'})
            print(resp.status)
    await la_news.close()


def run_async(coroutine):
    """Bridge for `schedule`, which only calls synchronous functions."""
    task = coroutine()
    loop.run_until_complete(task)


schedule.every().hour.do(run_async, publish_news)

while True:
    print('Checking schedule...')
    schedule.run_pending()
    time.sleep(300)

news.py (83 additions)

@@ -0,0 +1,83 @@
from lxml.html import fromstring
from aiohttp import ClientSession
from sqlite3 import Error
import hashlib
import sqlite3

BASE_URL = 'https://www.playlostark.com'


def _create_connection(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute('CREATE TABLE IF NOT EXISTS news_hashes(hash TEXT NOT NULL);')
        conn.commit()
    except Error as e:
        print(e)
    return conn


class NewsScraper:
    def __init__(self, loop=None, client=None, database=None):
        if loop is None and client is None:
            raise Exception('NewsScraper needs either an event loop or an existing ClientSession')
        # Reuse the provided session if there is one, otherwise create our own.
        self.client = client if client is not None else ClientSession(loop=loop)
        self.database = _create_connection('news.db') if database is None else database

    async def _fetch_url(self, url):
        async with self.client.get(url=url) as resp:
            return await resp.text()

    def _store_hash(self, _hash, table):
        with self.database as db:
            cur = db.cursor()
            cur.execute(f'INSERT INTO {table} VALUES(?);', (_hash,))
            db.commit()

    def _check_hash(self, _hash, table):
        with self.database as db:
            cur = db.cursor()
            cur.execute(f'SELECT * FROM {table} WHERE hash=?;', (_hash,))
            result = cur.fetchone()
        return result

    async def news_articles(self):
        """
        XPath: //div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]
        :return: list of article metadata dicts, oldest first (empty if nothing new)
        """
        url = BASE_URL + '/en-us/news'
        raw_html = await self._fetch_url(url)
        doc = fromstring(raw_html)
        article_box = doc.xpath('//div[@class="ags-SlotModule ags-SlotModule--blog ags-SlotModule--threePerRow"]')
        articles = []
        for article in article_box:
            article_meta = {
                'title': str(article.xpath('.//span[@class="ags-SlotModule-contentContainer-heading '
                                           'ags-SlotModule-contentContainer-heading '
                                           'ags-SlotModule-contentContainer-heading--blog"]/text()')[0]).strip(),
                'preview_text': str(article.xpath('.//div[@class="ags-SlotModule-contentContainer-text '
                                                  'ags-SlotModule-contentContainer-text--blog '
                                                  'ags-SlotModule-contentContainer-text"]/text()')[0]).strip(),
                'article_link': BASE_URL + article.xpath('.//a[@class="ags-SlotModule-spacer"]/@href')[0].strip(),
                'image_preview': 'https:' + article.xpath('.//img[@class="ags-SlotModule-imageContainer-image"]/@src')[0],
                'tag': str(article.xpath('.//h4[@class="ags-SlotModule-aboveImageBlogTag"]/text()')[0]).strip(),
            }
            _hash = hashlib.md5(str(article_meta).encode('UTF-8'), usedforsecurity=False).hexdigest()
            if self._check_hash(_hash, 'news_hashes'):
                # Articles are listed newest first, so the first already-seen
                # hash means the rest have been posted before.
                break
            self._store_hash(_hash, 'news_hashes')
            articles.append(article_meta)
        # list.reverse() returns None, so reverse in place and then return the
        # list so the oldest article is posted first.
        articles.reverse()
        return articles

    async def close(self):
        await self.client.close()

requirements.txt (4 additions)

@@ -0,0 +1,4 @@
python-dotenv~=0.19.0
lxml~=4.6.2
aiohttp~=3.7.3
schedule~=1.1.0