feat: implement DraggableWindow component and update index.vue layout; enhance scraping scripts for better error handling and content extraction

This commit is contained in:
yuanhau 2025-05-12 00:39:28 +08:00
parent 8b07d4b3be
commit b461e81360
8 changed files with 522 additions and 174 deletions

View file

@ -1,7 +1,25 @@
# Status
## setn.py
Working
## tvbs.py
Working
## taisounds.py
Working
## cna.py
Broken
Error: `Error: 'utf-8' codec can't decode byte 0x83 in position 0: invalid start byte`
## chinatimes.py
Broken
Error: `Error: 'utf-8' codec can't decode byte 0xa3 in position 0: invalid start byte`
## twreporter.py
Broken
Error: `Error: 'utf-8' codec can't decode byte 0xc0 in position 2: invalid start byte`

View file

@ -0,0 +1,56 @@
# Scrape a Chinatimes article: fetch the page, decompress it if needed,
# detect the real charset, and print the title and body text.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io
import zlib

# Load environment variables from .env file
dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Do NOT advertise 'br': urllib cannot decode Brotli, and a Brotli body is
    # the likely cause of the previous "'utf-8' codec can't decode byte" error.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.chinatimes.com/realtimenews/20250511002798-260407?chdtv"

try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    raw = response.read()
    # Decompress according to the server's Content-Encoding header.
    content_encoding = response.info().get('Content-Encoding')
    if content_encoding == 'gzip':
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    elif content_encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)          # zlib-wrapped deflate
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)  # raw deflate
    # Detect the real charset instead of assuming UTF-8 (chardet is already a
    # dependency); replace undecodable bytes so one stray byte cannot abort
    # the whole scrape.
    charset = chardet.detect(raw).get('encoding') or 'utf-8'
    html = raw.decode(charset, errors='replace')

    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('h1', class_='article-title')
    title_text = title.text.strip() if title else "No title found"
    article = soup.find('div', class_="article-body")
    content = article.text.strip() if article else "No content found"

    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {content}")
except Exception as e:
    print(f"Error: {str(e)}")
    # Dump every class present in the page to help relocate selectors after a
    # site redesign.
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")

View file

@ -0,0 +1,61 @@
# Scrape a Taisounds article: fetch the page, decompress it if needed,
# detect the real charset, and print the title and body text.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io
import zlib

# Load environment variables from .env file
dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Do NOT advertise 'br': urllib cannot decode Brotli responses.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.taisounds.com/news/content/84/189872"

try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    raw = response.read()
    # Decompress according to the server's Content-Encoding header.
    content_encoding = response.info().get('Content-Encoding')
    if content_encoding == 'gzip':
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    elif content_encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)          # zlib-wrapped deflate
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)  # raw deflate
    # Detect the real charset instead of assuming UTF-8; replace undecodable
    # bytes so one stray byte cannot abort the whole scrape.
    charset = chardet.detect(raw).get('encoding') or 'utf-8'
    html = raw.decode(charset, errors='replace')

    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('h1')
    title_text = title.text.strip() if title else "No title found"
    #author = soup.find('div', class_='publish')
    #author_text = author.text.strip().soup.find('a').text.strip() if author else "No author found"
    article = soup.find('div', class_='news-box-text')
    content = article.text.strip() if article else "No content found"

    # Print results
    print(f"Title: {title_text}")
    #print(f"Author: {author_text}")
    print(f"Content: {content}")
except Exception as e:
    print(f"Error: {str(e)}")
    # Dump every class present in the page to help relocate selectors after a
    # site redesign.
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")

57
scraping/findText/tvbs.py Normal file
View file

@ -0,0 +1,57 @@
# Scrape a TVBS article: fetch the page, decompress it if needed, detect the
# real charset, and print the title and body text.
# NOTE(review): this file was marked BROKEN but the status doc lists tvbs.py
# as Working — confirm after the decode fix below.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io
import zlib

# Load environment variables from .env file
dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Do NOT advertise 'br': urllib cannot decode Brotli responses.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://news.tvbs.com.tw/politics/2866915"

try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    raw = response.read()
    # Decompress according to the server's Content-Encoding header.
    content_encoding = response.info().get('Content-Encoding')
    if content_encoding == 'gzip':
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    elif content_encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)          # zlib-wrapped deflate
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)  # raw deflate
    # Detect the real charset instead of assuming UTF-8; replace undecodable
    # bytes so one stray byte cannot abort the whole scrape.
    charset = chardet.detect(raw).get('encoding') or 'utf-8'
    html = raw.decode(charset, errors='replace')

    soup = BeautifulSoup(html, "html.parser")
    # Extract content
    title = soup.find('h1', class_='title')
    title_text = title.text.strip() if title else "No title found"
    article = soup.find('div', class_="article_content")
    paragraph = article.text.strip() if article else ""

    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {paragraph}")
except Exception as e:
    print(f"Error: {str(e)}")

View file

@ -0,0 +1,57 @@
# Scrape a Twreporter article: fetch the page, decompress it if needed,
# detect the real charset, and print the title and body text.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io
import zlib

# Load environment variables from .env file
dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Do NOT advertise 'br': urllib cannot decode Brotli, and a Brotli body is
    # the likely cause of the previous "'utf-8' codec can't decode byte" error.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.twreporter.org/a/olena-yagupova-kidnapped-by-russian-soldiers"

try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    raw = response.read()
    # Decompress according to the server's Content-Encoding header.
    content_encoding = response.info().get('Content-Encoding')
    if content_encoding == 'gzip':
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    elif content_encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)          # zlib-wrapped deflate
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)  # raw deflate
    # Detect the real charset instead of assuming UTF-8; replace undecodable
    # bytes so one stray byte cannot abort the whole scrape.
    charset = chardet.detect(raw).get('encoding') or 'utf-8'
    html = raw.decode(charset, errors='replace')

    soup = BeautifulSoup(html, "html.parser")
    # Extract content. BeautifulSoup does EXACT string matching on class_, so
    # a regex must be passed as a compiled pattern — the previous raw-string
    # patterns could never match the suffixed styled-components class names.
    title = soup.find('div', class_=re.compile(r'headline__DefaultContainer'))
    title_text = title.text.strip() if title else "No title found"
    article = soup.find('div', class_=re.compile(r"article-page__ContentBlock"))
    paragraph = article.text.strip() if article else ""

    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {paragraph}")
except Exception as e:
    print(f"Error: {str(e)}")