Mirror of https://github.com/hpware/news-analyze.git, synced 2025-06-23 21:14:23 +00:00

Update some stuff & remove unneeded code like the Python stuff; it is in the web app.

This commit is contained in:
parent bc9a63f6ab
commit 62fa31ae4a
24 changed files with 104 additions and 2580 deletions
52 README.md
@ -4,6 +4,8 @@

App Design: [PDF Document](/design.pdf)

Reverse engineering documentation: [about](/about/)

## Before deploying, please know this:

This code is absolutely NOT designed to be spun up on Vercel or Netlify: the scraping system now lives inside the main website code, and the entire "caching feature" is kept in memory, so please don't use those platforms. On Zeabur your cost might be high; idk, I haven't tried it yet. The web URL https://news.yuanhau.com is hosted on my own infra, and you should do the same. Please get a server off of Yahoo 拍賣 (Yahoo Auctions) or 蝦皮 (Shopee) to do so.

@ -30,14 +32,11 @@ This code is absolutly NOT designed to be spinned up at Vercel or Netlify, it ha

- puter.com
- Perplexity
- Ground.news
- 台灣新聞 (Taiwanese news)
- Threads
- Threads (政治方面, the politics side)
- xfce's Desktop Interface
- juice website
- MacOS
- Windows XP style X - UI
- Ghostty
- Some random chat app embedded in most business websites
- Treble's cool card effect (but not quite yet)

## Stack:

@ -52,53 +51,20 @@ This code is absolutly NOT designed to be spinned up at Vercel or Netlify, it ha

- BunJS
- Groq
- Custom Infra
- Python
- BeautifulSoup4
- uv

## Folder Structure

```
├── .github/
│   └── workflows/
├── components/
│   ├── app/
│   │   └── newsOrgAbout/
│   └── ui/
├── i18n/
├── layouts/
├── lib/
├── pages/
│   └── app/
├── public/
├── scraping/
├── server/
│   ├── api/
│   │   └── objectstorage/
│   ├── components/
│   └── routes/
├── styles/
├── app.vue
├── createDatabase.ts
├── nuxt.config.ts
├── package.json
├── tailwind.config.js
└── tsconfig.json
```

- Docker
- Docker Compose

## 如何執行 (How to run)

1. First, rename `.env.example` to `.env` and fill in the blanks.
2. Run `bun install` to install dependencies.
3. Run `bun run createDatabase` to create the database.
4. Run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file to the `scraping` folder.
5. Run `bun run build` to build the project.
6. Run `bun run preview` to start the preview server.
7. Open `http://localhost:3000` in your browser.
4. Run `bun run build` to build the project.
5. Run `bun run preview` to start the preview server.
6. Open `http://localhost:3000` in your browser.

### For scraping

First, run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file into the `scraping` folder, then cd into the `scraping` folder and run `python main.py` to start scraping Google News.
Scraping is embedded in the web app, so please use the web app.

## 有問題? Got questions?

Use this form: https://yhw.tw/SaBta
@ -1,24 +1,32 @@

# Scraping line today home
# Scraping Line Today's home page system

This took me some time, but they use a fancy system for pulling news data.

## Endpoint on news.yuanhau.com aka this repo (Cached results)

### /api/home/uuid_lt/action?query=${query}
Fetches the UUID of each listing for the given query.

### /api/home/lt/${query}
Fetches those UUIDs and returns the corresponding news items.
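For illustration, a minimal sketch (not code from this repo) of calling these two cached endpoints. It assumes `domestic` is a valid query value (it is the tab name used by the LINE Today URL in the next section) and that the first endpoint returns JSON; the shape of the second endpoint's payload is not documented here, so it is kept as raw text:

```python
# Sketch only; "domestic" and the JSON assumption are not confirmed by this doc.
import json
from urllib.request import urlopen

BASE = "https://news.yuanhau.com"

# UUIDs for each listing of the "domestic" tab
with urlopen(f"{BASE}/api/home/uuid_lt/action?query=domestic") as resp:
    uuids = json.loads(resp.read().decode("utf-8"))
print(uuids)

# The same tab resolved into news items; payload shape undocumented, so keep it as text
with urlopen(f"{BASE}/api/home/lt/domestic") as resp:
    news = resp.read().decode("utf-8")
print(news[:200])
```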
## Main endpoint

For local Taiwan news they use this URL: https://today.line.me/_next/data/v1/tw/v3/tab/domestic.json?tabs=domestic

From the _next? I thought that is static? I mean it maybe is, it is just providing with the URLs that the client will be fetching to the server, which is a bit fun.
From _next? I thought that was static? Maybe it is; it just provides the URLs that the client will then fetch from the server, which is a bit fun.
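As a rough sketch of reading that endpoint directly: the JSON path used below (`pageProps.fallback["getPageData,<tab>"].modules[].listings[].id`) is taken from the server route shown later in this diff, not from any official LINE Today documentation, so treat it as an assumption:

```python
# Sketch only; the response shape is assumed to match the server route in this diff.
import json
from urllib.request import Request, urlopen

url = "https://today.line.me/_next/data/v1/tw/v3/tab/domestic.json?tabs=domestic"
req = Request(url, headers={"Accept": "application/json", "User-Agent": "Mozilla/5.0"})
with urlopen(req) as resp:
    data = json.loads(resp.read().decode("utf-8"))

modules = data["pageProps"]["fallback"]["getPageData,domestic"]["modules"]
listing_ids = [
    listing["id"]
    for module in modules
    for listing in (module.get("listings") or [])
    if listing.get("id")
]
print(listing_ids)
```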
Here is a JSON snippet:

```json
{
  "id": "682b0cef1b1269f8dec93e60",
  "id": "the-news-id",
  "type": "HIGHLIGHT",
  "containerStyle": "Header",
  "name": "國內話題:新北重大車禍",
  "name": "國內話題:topic",
  "source": "LISTING",
  "header": {
    "title": "新北重大死傷車禍",
    "title": "the top title here",
    "hasCompositeTitle": false,
    "subTitle": "一輛小客車19日下午撞上放學人群,造成多名學童、大人送醫,至少3死10多傷,肇事的78歲男子當場昏迷。"
    "subTitle": "the news subtitle here"
  },
  "listings": [
    {

@ -43,9 +51,9 @@ This api can be used for fetching the news from them, however, there is an issue

And viewing the JSON, oh would you look at that.

```JSON
{
  "id": "262862833",
  "title": "派駐芬蘭遭白委扯焦慮症 林昶佐現身喊話",
  "publisher": "太報",
  "id": "news-id",
  "title": "news-title",
  "publisher": "news-publisher",
  "publisherId": "101366",
  "publishTimeUnix": 1747670221000,
  "contentType": "GENERAL",

@ -58,7 +66,7 @@ And viewing the JSON, oh would you look at that.

  },
  "categoryId": 100262,
  "categoryName": "國內",
  "shortDescription": "前立委林昶佐(右二)將出任駐芬蘭代表,民眾黨立委林憶君卻質疑罹患焦慮症不適合去北歐。翻攝畫面前立委林昶佐將接任駐芬蘭代表,民眾黨立委林憶君今(5/19)質詢指出,林林昶佐曾患焦慮症,北歐國家日常短,病症容易發作,質疑是否適合。林昶佐晚間現身直播節目,向病友喊話,要對自己有信心,「絕對可以回復到正常生活,包括工作」。林憶君指出,1990年芬蘭是全球自殺率最高國家,而且北歐國家的日照很短,病症容易發作..."
  "shortDescription": "The article's short description"
},
```

The url hash is just what we needed to use my scraper :D
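To show how that hash gets used: the URL pattern, the demo slug `oqmazXP`, and the CSS classes below are taken from the `scraping/lineToday.py` script removed later in this diff, so treat this as an illustration of the old Python path rather than the web app's current code:

```python
# Sketch based on the removed scraping/lineToday.py: turn a listing's url hash
# into the article page and pull out the title and body text.
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def fetch_line_today_article(url_hash: str) -> tuple[str, str]:
    url = "https://today.line.me/tw/v2/article/" + url_hash
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urlopen(req) as resp:
        soup = BeautifulSoup(resp.read(), "html.parser")
    title = soup.find("h1", class_="entityTitle")
    body = soup.find("article", class_="news-content")
    return (
        title.text.strip() if title else "No title found",
        body.text.strip() if body else "",
    )


print(fetch_line_today_article("oqmazXP")[0])
```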
@ -1 +0,0 @@
Copy-Item "./.env" -Destination "./scraping/.env"

@ -1 +0,0 @@
cp ./.env scraping/.env

@ -1 +0,0 @@
3.13

@ -1,3 +0,0 @@
# Scraping

This folder contains the code for scraping news from websites and storing the data into a Postgres database.

@ -1,31 +0,0 @@
# Status

## setn.py

Working

## tvbs.py

Working

## taisounds.py

Working

## cna.py

Broken

Error: `Error: 'utf-8' codec can't decode byte 0x83 in position 0: invalid start byte`

## chinatimes.py

Broken

Error: `Error: 'utf-8' codec can't decode byte 0xa3 in position 0: invalid start byte`

## twreporter.py

Broken

Error: `Error: 'utf-8' codec can't decode byte 0xc0 in position 2: invalid start byte`
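All three broken scripts die on the same `.decode('utf-8')` step. Below is a minimal sketch of a more defensive decode using `chardet`, which is already in `scraping/requirements.txt` (and imported, unused, by these scripts). Whether the real cause is a non-UTF-8 charset or a still-compressed body (the scripts advertise `br` in `Accept-Encoding` but only handle gzip) is untested guesswork:

```python
# Sketch only: decompress (gzip only, as the scrapers already do) and guess the
# charset before decoding, instead of assuming UTF-8.
import gzip
import io

import chardet


def decode_body(raw: bytes, content_encoding: str | None) -> str:
    if content_encoding == "gzip":
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        guess = chardet.detect(raw)  # e.g. {'encoding': 'Big5', 'confidence': 0.99, ...}
        return raw.decode(guess["encoding"] or "utf-8", errors="replace")
```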
@ -1,56 +0,0 @@
|
|||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
url = "https://www.chinatimes.com/realtimenews/20250511002798-260407?chdtv"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
title = soup.find('h1', class_='article-title')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('div', class_="article-body")
|
||||
content = article.text.strip() if article else "No content found"
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {content}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
if 'soup' in locals():
|
||||
print("\nAvailable classes in HTML:")
|
||||
for tag in soup.find_all(class_=True):
|
||||
print(f"Tag: {tag.name}, Class: {tag['class']}")
|
|
@ -1,56 +0,0 @@
|
|||
# BROKEN
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
|
||||
paragraph_css = "paragraph"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Extract content
|
||||
title = soup.find('h1').text.strip() if soup.find('h1') else ""
|
||||
article = soup.find('div', class_=paragraph_css)
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,58 +0,0 @@
|
|||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
url = "https://www.setn.com/News.aspx?NewsID=1654352"
|
||||
paragraph_css = "article"
|
||||
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
title = soup.find('h1', class_='news-title-3')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('article')
|
||||
content = article.text.strip() if article else "No content found"
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {content}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
if 'soup' in locals():
|
||||
print("\nAvailable classes in HTML:")
|
||||
for tag in soup.find_all(class_=True):
|
||||
print(f"Tag: {tag.name}, Class: {tag['class']}")
|
|
@ -1,61 +0,0 @@
|
|||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
url = "https://www.taisounds.com/news/content/84/189872"
|
||||
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
title = soup.find('h1')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
#author = soup.find('div', class_='publish')
|
||||
#author_text = author.text.strip().soup.find('a').text.strip() if author else "No author found"
|
||||
|
||||
article = soup.find('div', class_='news-box-text')
|
||||
content = article.text.strip() if article else "No content found"
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
#print(f"Author: {author_text}")
|
||||
print(f"Content: {content}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
if 'soup' in locals():
|
||||
print("\nAvailable classes in HTML:")
|
||||
for tag in soup.find_all(class_=True):
|
||||
print(f"Tag: {tag.name}, Class: {tag['class']}")
|
|
@ -1,57 +0,0 @@
|
|||
# BROKEN
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
url = "https://news.tvbs.com.tw/politics/2866915"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Extract content
|
||||
title = soup.find('h1', class_='title')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('div', class_="article_content")
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,57 +0,0 @@
|
|||
# BROKEN
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
url = "https://www.twreporter.org/a/olena-yagupova-kidnapped-by-russian-soldiers"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Extract content
|
||||
title = soup.find('div', class_=r'headline__DefaultContainer.*?')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('div', class_=r"article-page__ContentBlock.*?")
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,78 +0,0 @@
|
|||
import re
|
||||
from urllib.request import urlopen # URL request lib.
|
||||
from bs4 import BeautifulSoup # BeautifulSoup lib.
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import uuid
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
# Connect to PostgresDB
|
||||
conn = psycopg2.connect(database=os.getenv("POSTGRES_DB"),
|
||||
user=os.getenv("POSTGRES_USER"),
|
||||
password=os.getenv("POSTGRES_PASSWORD"),
|
||||
host=os.getenv("POSTGRES_HOST"),
|
||||
port=os.getenv("POSTGRES_PORT")
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
|
||||
}
|
||||
|
||||
url = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant"
|
||||
|
||||
topiccwiz_css = "PO9Zff Ccj79 kUVvS"
|
||||
page = urlopen(url)
|
||||
html_bytes = page.read()
|
||||
html = html_bytes.decode("utf-8")
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
# Find the topiccwiz element
|
||||
topiccwiz = soup.find_all("c-wiz", class_=topiccwiz_css)
|
||||
news_data = []
|
||||
index = 0
|
||||
for item in topiccwiz:
|
||||
index+= 1
|
||||
array = []
|
||||
hotarticles = item.find_all("article")
|
||||
first_article = ""
|
||||
passed_first = False
|
||||
catagory_uuid = str(uuid.uuid4())
|
||||
for article in hotarticles:
|
||||
article_uuid = str(uuid.uuid4())
|
||||
try:
|
||||
title_elem = article.find('a', class_='gPFEn')
|
||||
title = title_elem.text.strip() if title_elem else ''
|
||||
|
||||
source_elem = article.find('div', class_='vr1PYe')
|
||||
source = source_elem.text.strip() if source_elem else ''
|
||||
link_elem = article.find('a', class_='WwrzSb')
|
||||
orglink = link_elem['href'] if link_elem else ''
|
||||
link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
|
||||
cur.execute("""
|
||||
insert into hot_news (uuid, title, news_org, link, related_uuid)
|
||||
values (%s, %s, %s, %s, %s)
|
||||
""", (article_uuid, title, source, link, catagory_uuid))
|
||||
article_data = {
|
||||
"uuid": article_uuid,
|
||||
"title": title,
|
||||
"news_org": source,
|
||||
"link": link,
|
||||
"related_uuid": catagory_uuid
|
||||
}
|
||||
news_data.append(article_data)
|
||||
except Exception as e:
|
||||
print(f"Error processing article: {e}")
|
||||
continue
|
||||
|
||||
with open('hotnews_data.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(news_data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
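For reference, the `hot_news` table that the INSERT above writes to is created by `createDatabase.ts`, which this diff does not include. A purely hypothetical sketch of a matching table, with column names taken from that INSERT and every type a guess:

```python
# Hypothetical schema: column names come from the INSERT in the scraper above,
# the types are guesses, and the real definition lives in createDatabase.ts.
import os

import dotenv
import psycopg2

dotenv.load_dotenv()
conn = psycopg2.connect(
    database=os.getenv("POSTGRES_DB"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
    host=os.getenv("POSTGRES_HOST"),
    port=os.getenv("POSTGRES_PORT"),
)
with conn, conn.cursor() as cur:
    cur.execute("""
        create table if not exists hot_news (
            uuid uuid primary key,
            title text,
            news_org text,
            link text,
            related_uuid uuid
        )
    """)
conn.close()
```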
File diff suppressed because it is too large
@ -1,16 +0,0 @@
English Name,Chinese Name,Website,Type,Description
The Reporter,報導者,https://www.twreporter.org/,Online/Investigative,Non-profit investigative journalism platform.
Taisounds,太報,https://www.taisounds.com/,Online,Independent online news outlet.
Storm Media,風傳媒,https://www.storm.mg/,Online,"Online news and commentary, known for investigative journalism."
Mirror Media,鏡週刊,https://www.mirrormedia.mg/,Online/Weekly,"Online news and weekly magazine, known for exclusive stories."
Newtalk,新頭殼,https://newtalk.tw/,Online,"Online news, focuses on politics and social issues."
CommonWealth Magazine,天下雜誌,https://www.cw.com.tw/,Magazine/Online,Leading business and current affairs magazine.
Initium Media,端傳媒,https://theinitium.com/,Online,Independent Chinese-language news platform (Taiwan and Hong Kong focus).
United Daily News,聯合新聞網,https://udn.com/,Newspaper/Online,Major daily newspaper and online news portal.
Liberty Times,自由時報,https://news.ltn.com.tw/,Newspaper/Online,"Influential daily newspaper, pro-independence stance."
China Times,中時新聞網,https://www.chinatimes.com/,Newspaper/Online,"One of Taiwan’s oldest newspapers, large online presence."
SET News,三立新聞網,https://www.setn.com/,TV/Online,Major TV and online news provider.
ETtoday,東森新聞雲,https://www.ettoday.net/news/,Online/TV,"Large online news portal, also operates a TV channel."
Apple Online (TW),蘋果新聞網,https://tw.nextapple.com/,Online/Tabloid,"Successor to Apple Daily, popular online news/tabloid."
TVBS News,TVBS新聞網,https://news.tvbs.com.tw/,TV/Online,Major TV news channel with strong online presence.
Formosa TV News,民視新聞網,https://www.ftvnews.com.tw/,TV/Online,"Major TV news channel, also online."
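If you still have a copy of this removed file around, a small sketch of loading it with pandas (which was in `scraping/requirements.txt`); the filename `news_orgs.csv` is an assumption, since the diff does not show what the file was called:

```python
# Hypothetical filename; the diff does not name this CSV.
import pandas as pd

orgs = pd.read_csv("news_orgs.csv")
print(orgs[["English Name", "Chinese Name", "Website"]].head())
```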
@ -1,67 +0,0 @@
|
|||
# THIS WORKS :D
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="A LINE Today Scraper.")
|
||||
parser.add_argument("-s", "--slug", type=str, help="The article URL like: oqmazXP")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.slug:
|
||||
print("No Slug entered, please use -s oqmazXP as a demo.")
|
||||
exit(1)
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze) (A little note: If you see this, It means that your website is being scraped by other people, not the user hpware. Please keep that in mind at don't spam issues, I can't fix it.)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
url = "https://today.line.me/tw/v2/article/" + args.slug
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
print("GZIP")
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
print("Not GZIP")
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Extract content
|
||||
title = soup.find('h1', class_="entityTitle")
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('article', class_="news-content")
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,57 +0,0 @@
|
|||
# BROKEN
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
url = "https://www.msn.com/zh-tw/news/living/%E7%99%BD%E5%A4%A9-%E6%99%9A%E4%B8%8A%E9%81%8B%E5%8B%95%E5%93%AA%E5%80%8B%E5%A5%BD-%E9%86%AB%E6%8F%AD-1%E9%97%9C%E9%8D%B5-%E6%AF%94%E6%8C%91%E6%99%82%E9%96%93%E6%9B%B4%E9%87%8D%E8%A6%81/ar-AA1D4zTQ"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soupsoup = BeautifulSoup(html, "html.parser")
|
||||
soup = soupsoup.find('views-header-wc')
|
||||
# Extract content
|
||||
title = soup.find('h1', class_='viewsHeaderText')
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('body', class_="article-body")
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,57 +0,0 @@
|
|||
# BROKEN
|
||||
import re
|
||||
from urllib.request import urlopen, Request
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import dotenv
|
||||
import os
|
||||
import gzip
|
||||
import io
|
||||
|
||||
# Load environment variables from .env file
|
||||
dotenv.load_dotenv()
|
||||
|
||||
headers = {
|
||||
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': '*',
|
||||
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
|
||||
|
||||
url = "https://tw.news.yahoo.com/%E5%93%BD%E5%92%BD%E7%A8%B1%E8%80%81%E5%90%8C%E5%AD%B8%E6%9D%8E%E6%96%87%E5%AE%97-%E8%A2%AB%E5%86%A4%E7%8D%84-%E6%9F%AF%E6%96%87%E5%93%B2-%E4%BD%A0%E5%80%91%E5%8F%AA%E6%98%AF%E8%A6%81%E6%8A%BC%E6%88%91-%E5%85%B6%E4%BB%96%E5%85%88%E6%94%BE%E8%B5%B0-122535612.html"
|
||||
|
||||
try:
|
||||
req = Request(url, headers=headers)
|
||||
response = urlopen(req)
|
||||
if response.info().get('Content-Encoding') == 'gzip':
|
||||
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||
html = gzip_file.read().decode('utf-8')
|
||||
else:
|
||||
html = response.read().decode('utf-8')
|
||||
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Extract content
|
||||
title = soup.find('h1', attrs={"data-test-locator": "headline"})
|
||||
title_text = title.text.strip() if title else "No title found"
|
||||
|
||||
article = soup.find('div', class_="caas-body")
|
||||
paragraph = article.text.strip() if article else ""
|
||||
|
||||
# Print results
|
||||
print(f"Title: {title_text}")
|
||||
print(f"Content: {paragraph}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
|
@ -1,7 +0,0 @@
[project]
name = "scraping"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = []

@ -1,6 +0,0 @@
urlopen
beautifulsoup4
psycopg2-binary
pandas
dotenv
chardet

8 scraping/uv.lock (generated)

@ -1,8 +0,0 @@
version = 1
revision = 1
requires-python = ">=3.13"

[[package]]
name = "scraping"
version = "0.1.0"
source = { virtual = "." }
@ -1,10 +1,72 @@

// Check /about/scraping_line_today_home.md for more info or https://news.yuanhau.com/datainfo/linetodayjsondata.json
interface CacheItem {
  data: string[];
  timestamp: number;
}
const cache: Record<string, CacheItem> = {};
const CACHE_DURATION = 1000 * 60 * 60; // 1 Hour

async function getUUID(orgtype: string) {
  const type = orgtype.toLowerCase();
  if (cache[type] && Date.now() - cache[type].timestamp < CACHE_DURATION) {
    console.log("Serving from cache for type:", type);
    return cache[type].data;
  }

  try {
    const buildUrl = `https://today.line.me/_next/data/v1/tw/v3/tab/${type}.json?tabs=${type}`;
    const req = await fetch(buildUrl, {
      headers: {
        "Accept-Encoding": "gzip, deflate, br",
        Accept: "application/json",
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      },
    });
    const res = await req.json();
    const req2 = res.pageProps.fallback[`getPageData,${type}`].modules;
    const req3 = [];
    req2.forEach((key) => {
      const listings = key.listings;
      if (Array.isArray(listings)) {
        listings.forEach((listing) => {
          if (listing && listing.id) {
            req3.push(listing.id);
          }
        });
      }
    });

    cache[type] = {
      data: req3,
      timestamp: Date.now(),
    };

    return req3;
  } catch (e) {
    console.log(e);
    if (cache[type]) {
      console.log("Serving expired cache due to error");
      return cache[type].data;
    }
    return [];
  }
}

function filterUUIDs(ids: string[]): string[] {
  const uuidPattern =
    /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i;
  return ids.filter((id) => uuidPattern.test(id));
}

export default defineEventHandler(async (event) => {
  try {
    const data = await getUUID(String(query.query));
    const validUUIDs = filterUUIDs(data || []);
    const slug = getRouterParam(event, "slug");
    const urlBuild = "/api/home/uuid_lt/" + slug;
    const urlBuild = "/api/home/uuid_lt/action?query=" + String(slug.trim());
    const articleArray = [];
    const req = await fetch(urlBuild);
    const res = await req.text();
    const { data: res } = await useFetch(urlBuild);
    return res;
  } catch (e) {
    console.log(e);
@ -6,7 +6,8 @@ interface CacheItem {

const cache: Record<string, CacheItem> = {};
const CACHE_DURATION = 1000 * 60 * 60; // 1 Hour

async function getLineTodayData(type: string) {
async function getLineTodayData(orgtype: string) {
  const type = orgtype.toLowerCase();
  if (cache[type] && Date.now() - cache[type].timestamp < CACHE_DURATION) {
    console.log("Serving from cache for type:", type);
    return cache[type].data;

@ -23,7 +24,7 @@ async function getLineTodayData(type: string) {

      },
    });
    const res = await req.json();
    const req2 = res.pageProps.fallback["getPageData,domestic"].modules;
    const req2 = res.pageProps.fallback[`getPageData,${type}`].modules;
    const req3 = [];
    req2.forEach((key) => {
      const listings = key.listings;

@ -67,8 +68,16 @@ export default defineEventHandler(async (event) => {

  }
  const data = await getLineTodayData(String(query.query));
  const validUUIDs = filterUUIDs(data || []);
  const noDup = [];
  validUUIDs.forEach((key) => {
    if (noDup.includes(key)) {
      return;
    } else {
      noDup.push(key);
    }
  });
  return {
    data: validUUIDs,
    data: noDup,
    cached: !!cache[String(query.query)],
  };
});