mirror of https://github.com/hpware/news-analyze.git
synced 2025-06-24 05:24:23 +00:00
feat: update README with folder structure and execution instructions; add scraping scripts for news articles
This commit is contained in:
parent c68606ffbe
commit f8fa412de9
7 changed files with 188 additions and 41 deletions
@@ -1,73 +0,0 @@
# Going to love the stupid tab tab tab tab system, instead of using braces.
import re                                    # Regular expressions
from urllib.request import Request, urlopen  # URL request lib (Request is needed to send custom headers)
from bs4 import BeautifulSoup                # BeautifulSoup lib.
import json
import psycopg2                              # Only used by the commented-out PostgreSQL block below
import pandas as pd                          # Imported but not used yet
import dotenv
import os

# Load environment variables from .env file
dotenv.load_dotenv()

# Connect to PostgreSQL (left commented out for now)
#conn = psycopg2.connect(database=os.getenv("POSTGRES_DB"),
#                        user=os.getenv("POSTGRES_USER"),
#                        password=os.getenv("POSTGRES_PASSWORD"),
#                        host=os.getenv("POSTGRES_HOST"),
#                        port=os.getenv("POSTGRES_PORT")
#)
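# Sketch of a possible insert path once the connection above is enabled
# (it would go inside the per-article loop further down). The "articles"
# table and its columns are assumptions for illustration, not defined here:
#   cur = conn.cursor()
#   cur.execute(
#       "INSERT INTO articles (title, source, link) VALUES (%s, %s, %s)",
#       (title, source, link),
#   )
#   conn.commit()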

headers = {
    'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)'
}

#url = "https://tw.news.yahoo.com/"
#url = "https://news.google.com/home?hl=zh-TW&gl=TW&ceid=TW:zh-Hant"
url = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant"

# Exact class attribute string of the c-wiz containers that wrap each topic's story cluster
topiccwiz_css = "PO9Zff Ccj79 kUVvS"

# Fetch the page; a bare urlopen(url) would never send the custom User-Agent,
# so wrap the URL in a Request object that carries the headers defined above.
page = urlopen(Request(url, headers=headers))
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

# Find the topic c-wiz containers
topiccwiz = soup.find_all("c-wiz", class_=topiccwiz_css)
news_data = []
index = 0
for item in topiccwiz:
    index += 1
    array = []
    hotarticles = item.find_all("article")
    for article in hotarticles:
        try:
            title_elem = article.find('a', class_='gPFEn')
            title = title_elem.text.strip() if title_elem else ''

            source_elem = article.find('div', class_='vr1PYe')
            source = source_elem.text.strip() if source_elem else ''

            link_elem = article.find('a', class_='WwrzSb')
            orglink = link_elem['href'] if link_elem else ''
            # Hrefs come back relative ("./read/..."); rewrite them as absolute links.
            link = re.sub(r'^\./read/', 'https://news.google.com/read/', orglink)

            article_data = {
                'title': title,
                'source': source,
                'link': link,
            }
            array.append(article_data)
        except Exception as e:
            print(f"Error processing article: {e}")
            continue

    # json.dump writes the integer index keys out as strings ("1", "2", ...)
    sendData = {
        index: array
    }
    news_data.append(sendData)

with open('news.json', 'w', encoding='utf-8') as f:
    json.dump(news_data, f, ensure_ascii=False, indent=2)
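For reference, news.json ends up as a list with one object per topic cluster, keyed by the cluster's 1-based index. A minimal consumer sketch (not part of this commit; it just assumes the shape produced above):

import json

with open('news.json', encoding='utf-8') as f:
    news_data = json.load(f)

for cluster in news_data:
    for idx, articles in cluster.items():
        for a in articles:
            print(idx, a['source'], a['title'], a['link'])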