mirror of
https://github.com/hpware/news-analyze.git
synced 2025-06-23 21:14:23 +00:00
Made line_today.py kinda work, I guess. But I have no idea how I can run
this without issues in prod, though. Also, the "BlurPageBeforeLogin" thing works just fine, and checkCookie is now working (but without the database part just yet)
This commit is contained in:
parent
0e26a23261
commit
81012f5061
6 changed files with 91 additions and 36 deletions
|
@ -10,12 +10,21 @@ import dotenv
|
|||
import os
|
||||
import gzip
|
||||
import io
|
||||
import argparse
|
||||
|
||||
# Command-line interface: the scraper takes a single LINE Today article slug
# (the trailing path segment of a https://today.line.me/tw/v2/article/<slug> URL).
parser = argparse.ArgumentParser(description="A LINE Today Scraper.")
parser.add_argument("-s", "--slug", type=str, help="The article URL like: oqmazXP")
args = parser.parse_args()

# Bail out early when no slug was supplied — the request URL built later
# depends on it.
if not args.slug:
    print("No Slug entered, please use -s oqmazXP as a demo.")
    # Raise SystemExit instead of calling the site-module `exit()` helper:
    # `exit` is injected by the `site` module and is not guaranteed to exist
    # when the script runs with `python -S` or as a frozen executable.
    raise SystemExit(1)
|
||||
|
||||
# Load environment variables from a .env file into the process environment
# (third-party `python-dotenv`; presumably a silent no-op when no .env file
# exists — TODO confirm, nothing in this script checks the result).
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze) (A little note: If you see this, It means that your website is being scraped by other people, not the user hpware. Please keep that in mind at don't spam issues, I can't fix it.)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
|
@ -28,9 +37,8 @@ headers = {
|
|||
}
|
||||
|
||||
|
||||
# NOTE(review): dead assignment — `url` is rebuilt from args.slug as the very
# first statement of the try block below, and the earlier guard guarantees
# args.slug is set, so this hard-coded demo URL is never used; safe to delete.
url = "https://today.line.me/tw/v2/article/oqmazXP"
|
||||
|
||||
try:
|
||||
url = "https://today.line.me/tw/v2/article/" + args.slug
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
|
@ -38,20 +46,20 @@ try:
|
|||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
|
||||
|
||||
# Parse the decoded page and pull out the headline and the article body.
soup = BeautifulSoup(html, "html.parser")

# Extract content.
# (Fixed: the original performed the identical `soup.find('h1', ...)` lookup
# twice on consecutive lines; one call is enough.)
title = soup.find('h1', class_="entityTitle")
title_text = title.text.strip() if title else "No title found"

article = soup.find('article', class_="news-content")
# Empty string (not a sentinel message) when the body is missing, matching
# the original behavior.
paragraph = article.text.strip() if article else ""

# Print results
print(f"Title: {title_text}")
print(f"Content: {paragraph}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue