From 5b47d2d3bc75c659717a48b7b7179733bf34b0f4 Mon Sep 17 00:00:00 2001
From: LiuShen <3162475700@qq.com>
Date: Sat, 3 May 2025 15:45:52 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=97=E6=9B=B4=E5=8A=A0=E5=90=88?=
 =?UTF-8?q?=E7=90=86=E7=9A=84=E8=AF=B7=E6=B1=82=E5=A4=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 friend_circle_lite/get_info.py       |  58 ++++++++--
 rss_subscribe/push_article_update.py |  13 ++-
 run.py                               | 165 +++++++++++++++++----------
 3 files changed, 163 insertions(+), 73 deletions(-)

diff --git a/friend_circle_lite/get_info.py b/friend_circle_lite/get_info.py
index 5127c1c..ea43301 100644
--- a/friend_circle_lite/get_info.py
+++ b/friend_circle_lite/get_info.py
@@ -1,14 +1,36 @@
 import logging
 from datetime import datetime, timedelta, timezone
+import re
+from typing import Any
+from urllib.parse import urljoin, urlparse
 from dateutil import parser
 import requests
-import re
 import feedparser
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # 标准化的请求头
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
+HEADERS_JSON = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "X-Friend-Circle": "1.0"
+}
+
+HEADERS_XML = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "X-Friend-Circle": "1.0"
 }
 
 timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
     for feed_type, path in possible_feeds:
         feed_url = blog_url.rstrip('/') + path
         try:
-            response = session.get(feed_url, headers=headers, timeout=timeout)
+            response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
             if response.status_code == 200:
                 return [feed_type, feed_url]
         except requests.RequestException:
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
         dict: 包含网站名称、作者、原链接和每篇文章详细内容的字典。
     """
     try:
-        response = session.get(url, headers=headers, timeout=timeout)
+        response = session.get(url, headers=HEADERS_XML, timeout=timeout)
         response.encoding = response.apparent_encoding or 'utf-8'
         feed = feedparser.parse(response.text)
 
         result = {
-            'website_name': feed.feed.title if 'title' in feed.feed else '',
-            'author': feed.feed.author if 'author' in feed.feed else '',
-            'link': feed.feed.link if 'link' in feed.feed else '',
+            'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
+            'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
+            'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
             'articles': []
         }
 
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
                 logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
 
             # 处理链接中可能存在的错误,比如ip或localhost
-            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
+            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
 
             article = {
                 'title': entry.title if 'title' in entry else '',
@@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
     # path = re.sub(r'^https?://[^/]+', '', link)
     # print(path)
 
-    return link
+    try:
+        parsed = urlparse(link)
+        if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc): # IP地址或localhost
+            # 提取 path + query
+            path = parsed.path or '/'
+            if parsed.query:
+                path += '?' + parsed.query
+            return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
+        else:
+            return link # 合法域名则返回原链接
+    except Exception as e:
+        logging.warning(f"替换链接时出错:{link}, error: {e}")
+        return link
 
 def process_friend(friend, session, count, specific_RSS=[]):
     """
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
     session = requests.Session()
 
     try:
-        response = session.get(json_url, headers=headers, timeout=timeout)
+        response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
         friends_data = response.json()
     except Exception as e:
         logging.error(f"无法获取链接:{json_url} :{e}", exc_info=True)
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
         dict: 合并后的文章信息字典,已去重处理
     """
     try:
-        response = requests.get(marge_json_url, headers=headers, timeout=timeout)
+        response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
         marge_data = response.json()
     except Exception as e:
         logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)
diff --git a/rss_subscribe/push_article_update.py b/rss_subscribe/push_article_update.py
index df3ccec..d43f1ad 100644
--- a/rss_subscribe/push_article_update.py
+++ b/rss_subscribe/push_article_update.py
@@ -6,10 +6,17 @@ import json
 import os
 
 # 标准化的请求头
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+HEADERS_JSON = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "X-Friend-Circle": "1.0"
 }
 
+
 def extract_emails_from_issues(api_url):
     """
     从GitHub issues API中提取以[e-mail]开头的title中的邮箱地址。
@@ -26,7 +33,7 @@
         }
     """
     try:
-        response = requests.get(api_url, headers=headers)
+        response = requests.get(api_url, headers=HEADERS_JSON, timeout=10)
         response.raise_for_status()
         issues = response.json()
     except Exception as e:
diff --git a/run.py b/run.py
index 29f7c34..df21df6 100644
--- a/run.py
+++ b/run.py
@@ -1,95 +1,136 @@
-# 引入 check_feed 和 parse_feed 函数
-from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time, marge_data_from_json_url, marge_errors_from_json_url, deal_with_large_data
-from friend_circle_lite.get_conf import load_config
-from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
-from push_rss_update.send_email import send_emails
-
 import logging
 import json
 import sys
 import os
 
-# 日志记录
-logging.basicConfig(level=logging.INFO, format='😋 %(levelname)s: %(message)s')
+from friend_circle_lite.get_info import (
+    fetch_and_process_data,
+    marge_data_from_json_url,
+    marge_errors_from_json_url,
+    deal_with_large_data
+)
+from friend_circle_lite.get_conf import load_config
+from rss_subscribe.push_article_update import (
+    get_latest_articles_from_link,
+    extract_emails_from_issues
+)
+from push_rss_update.send_email import send_emails
 
+# ========== 日志设置 ==========
+logging.basicConfig(
+    level=logging.INFO,
+    format='😋 %(levelname)s: %(message)s'
+)
 
-# 爬虫部分内容
+# ========== 加载配置 ==========
 config = load_config("./conf.yaml")
+
+# ========== 爬虫模块 ==========
 if config["spider_settings"]["enable"]:
-    logging.info("爬虫已启用")
+    logging.info("✅ 爬虫已启用")
+
     json_url = config['spider_settings']['json_url']
     article_count = config['spider_settings']['article_count']
-    specific_RSS = config['specific_RSS']
-    logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
-    result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
+    specific_rss = config['specific_RSS']
+
+    logging.info(f"📥 正在从 {json_url} 获取数据,每个博客获取 {article_count} 篇文章")
+    result, lost_friends = fetch_and_process_data(
+        json_url=json_url,
+        specific_RSS=specific_rss,
+        count=article_count
+    ) # type: ignore
+
     if config["spider_settings"]["merge_result"]["enable"]:
-        marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
-        logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
-        result = marge_data_from_json_url(result, marge_json_url + "/all.json")
-        lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
-    logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
+        merge_url = config['spider_settings']["merge_result"]['merge_json_url']
+        logging.info(f"🔀 合并功能开启,从 {merge_url} 获取外部数据")
+
+        result = marge_data_from_json_url(result, f"{merge_url}/all.json")
+        lost_friends = marge_errors_from_json_url(lost_friends, f"{merge_url}/errors.json")
+
+    article_count = len(result.get("article_data", []))
+    logging.info(f"📦 数据获取完毕,共有 {article_count} 位好友的动态,正在处理数据")
+
     result = deal_with_large_data(result)
 
     with open("all.json", "w", encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+
     with open("errors.json", "w", encoding="utf-8") as f:
         json.dump(lost_friends, f, ensure_ascii=False, indent=2)
 
+# ========== 推送准备 ==========
 if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
-    logging.info("推送功能已启用,正在准备推送,获取配置信息")
-    email_settings = config["smtp"]
-    email = email_settings["email"]
-    server = email_settings["server"]
-    port = email_settings["port"]
-    use_tls = email_settings["use_tls"]
+    logging.info("📨 推送功能已启用,正在准备中...")
+
+    smtp_conf = config["smtp"]
+    sender_email = smtp_conf["email"]
+    server = smtp_conf["server"]
+    port = smtp_conf["port"]
+    use_tls = smtp_conf["use_tls"]
     password = os.getenv("SMTP_PWD")
-    logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
-    logging.info("密码:{pwd}************".format(pwd=password[:3]))
+
+    logging.info(f"📡 SMTP 服务器:{server}:{port}")
+    if not password:
+        logging.error("❌ 环境变量 SMTP_PWD 未设置,无法发送邮件")
+        sys.exit(1)
+    else:
+        logging.info(f"🔐 密码(部分):{password[:3]}*****")
 
+# ========== 邮件推送(待实现)==========
 if config["email_push"]["enable"]:
-    logging.info("邮件推送已启用")
-    logging.info("抱歉,目前暂未实现功能")
-
-if config["rss_subscribe"]["enable"]:
-    logging.info("RSS 订阅推送已启用")
-    # 获取并强制转换为字符串
-    # 尝试从环境变量获取 FCL_REPO
-    fcl_repo = os.getenv('FCL_REPO')
+    logging.info("📧 邮件推送已启用")
+    logging.info("⚠️ 抱歉,目前尚未实现邮件推送功能")
 
-    # 提取 github_username 和 github_repo
+# ========== RSS 订阅推送 ==========
+if config["rss_subscribe"]["enable"]:
+    logging.info("📰 RSS 订阅推送已启用")
+
+    smtp_conf = config["smtp"]
+    sender_email = smtp_conf["email"]
+    server = smtp_conf["server"]
+    port = smtp_conf["port"]
+    use_tls = smtp_conf["use_tls"]
+    password = os.getenv("SMTP_PWD")
+
+    # 获取 GitHub 仓库信息
+    fcl_repo = os.getenv('FCL_REPO')
     if fcl_repo:
         github_username, github_repo = fcl_repo.split('/')
     else:
         github_username = str(config["rss_subscribe"]["github_username"]).strip()
         github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
-
-    # 输出 github_username 和 github_repo
-    logging.info("github_username: {github_username}".format(github_username=github_username))
-    logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
-
+
+    logging.info(f"👤 GitHub 用户名:{github_username}")
+    logging.info(f"📁 GitHub 仓库:{github_repo}")
+
     your_blog_url = config["rss_subscribe"]["your_blog_url"]
     email_template = config["rss_subscribe"]["email_template"]
-    # 获取网站信息
     website_title = config["rss_subscribe"]["website_info"]["title"]
-    # 获取最近更新的文章
+
     latest_articles = get_latest_articles_from_link(
         url=your_blog_url,
         count=5,
         last_articles_path="./rss_subscribe/last_articles.json"
-        )
-    logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
-    if latest_articles == None:
-        logging.info("无未进行推送的新文章")
+    )
+
+    if not latest_articles:
+        logging.info("📭 无新文章,无需推送")
     else:
-        github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
-        logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
+        logging.info(f"🆕 获取到的最新文章:{latest_articles}")
+
+        github_api_url = (
+            f"https://api.github.com/repos/{github_username}/{github_repo}/issues"
+            f"?state=closed&label=subscribed&per_page=200"
+        )
+        logging.info(f"🔎 正在从 GitHub 获取订阅邮箱:{github_api_url}")
         email_list = extract_emails_from_issues(github_api_url)
-        if email_list == None:
-            logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确")
+
+        if not email_list:
+            logging.info("⚠️ 无订阅邮箱,请检查格式或是否有订阅者")
             sys.exit(0)
-        else:
-            logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list))
-            # 循环latest_articles,发送邮件
+
+        logging.info(f"📬 获取到邮箱列表:{email_list}")
+
         for article in latest_articles:
             template_data = {
                 "title": article["title"],
@@ -97,17 +138,25 @@ if config["rss_subscribe"]["enable"]:
                 "published": article["published"],
                 "link": article["link"],
                 "website_title": website_title,
-                "github_issue_url": f"https://github.com/{github_username}/{github_repo}/issues?q=is%3Aissue+is%3Aclosed",
+                "github_issue_url": (
+                    f"https://github.com/{github_username}/{github_repo}"
+                    "/issues?q=is%3Aissue+is%3Aclosed"
+                ),
             }
-
+
             send_emails(
                 emails=email_list["emails"],
-                sender_email=email,
+                sender_email=sender_email,
                 smtp_server=server,
                 port=port,
                 password=password,
-                subject= website_title + "の最新文章:" + article["title"],
-                body="文章链接:" + article["link"] + "\n" + "文章内容:" + article["summary"] + "\n" + "发布时间:" + article["published"],
+                subject=f"{website_title} の最新文章:{article['title']}",
+                body=(
+                    f"📄 文章标题:{article['title']}\n"
+                    f"🔗 链接:{article['link']}\n"
+                    f"📝 简介:{article['summary']}\n"
+                    f"🕒 发布时间:{article['published']}"
+                ),
                 template_path=email_template,
                 template_data=template_data,
                 use_tls=use_tls