mirror of https://github.com/hpware/news-analyze.git
synced 2025-06-24 05:24:23 +00:00
feat: update README with folder structure and execution instructions; add scraping scripts for news articles
This commit is contained in:
parent c68606ffbe
commit f8fa412de9
7 changed files with 188 additions and 41 deletions
@@ -1,73 +0,0 @@
# Going to love the stupid tab tab tab tab system, instead of using braces.
import re                                    # Regular expressions
from urllib.request import Request, urlopen  # URL request lib (Request is needed to send custom headers)
from bs4 import BeautifulSoup                # BeautifulSoup lib.
import json
import psycopg2                              # Only used by the commented-out PostgreSQL block below
import pandas as pd                          # Imported but not used yet
import dotenv
import os

# Load environment variables from .env file
dotenv.load_dotenv()

# Connect to PostgreSQL (left commented out for now)
#conn = psycopg2.connect(database=os.getenv("POSTGRES_DB"),
#                        user=os.getenv("POSTGRES_USER"),
#                        password=os.getenv("POSTGRES_PASSWORD"),
#                        host=os.getenv("POSTGRES_HOST"),
#                        port=os.getenv("POSTGRES_PORT")
#)
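# Sketch of a possible insert path once the connection above is enabled
# (it would go inside the per-article loop further down). The "articles"
# table and its columns are assumptions for illustration, not defined here:
#   cur = conn.cursor()
#   cur.execute(
#       "INSERT INTO articles (title, source, link) VALUES (%s, %s, %s)",
#       (title, source, link),
#   )
#   conn.commit()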

headers = {
    'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)'
}

#url = "https://tw.news.yahoo.com/"
#url = "https://news.google.com/home?hl=zh-TW&gl=TW&ceid=TW:zh-Hant"
url = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant"

# Exact class attribute string of the c-wiz containers that wrap each topic's story cluster
topiccwiz_css = "PO9Zff Ccj79 kUVvS"

# Fetch the page; a bare urlopen(url) would never send the custom User-Agent,
# so wrap the URL in a Request object that carries the headers defined above.
page = urlopen(Request(url, headers=headers))
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

# Find the topic c-wiz containers
topiccwiz = soup.find_all("c-wiz", class_=topiccwiz_css)
news_data = []
index = 0
for item in topiccwiz:
    index += 1
    array = []
    hotarticles = item.find_all("article")
    for article in hotarticles:
        try:
            title_elem = article.find('a', class_='gPFEn')
            title = title_elem.text.strip() if title_elem else ''

            source_elem = article.find('div', class_='vr1PYe')
            source = source_elem.text.strip() if source_elem else ''

            link_elem = article.find('a', class_='WwrzSb')
            orglink = link_elem['href'] if link_elem else ''
            # Hrefs come back relative ("./read/..."); rewrite them as absolute links.
            link = re.sub(r'^\./read/', 'https://news.google.com/read/', orglink)

            article_data = {
                'title': title,
                'source': source,
                'link': link,
            }
            array.append(article_data)
        except Exception as e:
            print(f"Error processing article: {e}")
            continue

    # json.dump writes the integer index keys out as strings ("1", "2", ...)
    sendData = {
        index: array
    }
    news_data.append(sendData)

with open('news.json', 'w', encoding='utf-8') as f:
    json.dump(news_data, f, ensure_ascii=False, indent=2)
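For reference, news.json ends up as a list with one object per topic cluster, keyed by the cluster's 1-based index. A minimal consumer sketch (not part of this commit; it just assumes the shape produced above):

import json

with open('news.json', encoding='utf-8') as f:
    news_data = json.load(f)

for cluster in news_data:
    for idx, articles in cluster.items():
        for a in articles:
            print(idx, a['source'], a['title'], a['link'])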