🤗更加合理的请求头

2025-05-03 15:45:52 +08:00
parent ec4328cbe0
commit 5b47d2d3bc
3 changed files with 163 additions and 73 deletions
@@ -1,14 +1,36 @@
 import logging
 from datetime import datetime, timedelta, timezone
 import re
 from typing import Any
 from urllib.parse import urljoin, urlparse
 from dateutil import parser
 import requests
 import re
 import feedparser
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # 标准化的请求头
-headers = {
+HEADERS_JSON = {
-    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
+    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 "
        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
    ),
    "X-Friend-Circle": "1.0"
 }
 # Request headers for feed fetches (passed to session.get for feed URLs):
 # a browser-style User-Agent that also names Friend-Circle-Lite, and an
 # Accept list that puts RSS/XML ahead of everything else.
 # NOTE(review): listing "br" in Accept-Encoding presumably relies on a Brotli
 # decoder (brotli / brotlicffi) being installed so such responses can be
 # decoded -- confirm, or drop "br" from the list.
 HEADERS_XML = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 "
        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
    ),
    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "X-Friend-Circle": "1.0"
 }
 timeout = (10, 15) # (connect timeout, read timeout) in seconds, so a slow server cannot keep a request waiting too long
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
    for feed_type, path in possible_feeds:
        feed_url = blog_url.rstrip('/') + path
        try:
-            response = session.get(feed_url, headers=headers, timeout=timeout)
+            response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
            if response.status_code == 200:
                return [feed_type, feed_url]
        except requests.RequestException:
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
    dict: 包含网站名称、作者、原链接和每篇文章详细内容的字典。
    """
    try:
-        response = session.get(url, headers=headers, timeout=timeout)
+        response = session.get(url, headers=HEADERS_XML, timeout=timeout)
        response.encoding = response.apparent_encoding or 'utf-8'
        feed = feedparser.parse(response.text)
        result = {
-            'website_name': feed.feed.title if 'title' in feed.feed else '',
+            'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
-            'author': feed.feed.author if 'author' in feed.feed else '',
+            'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
-            'link': feed.feed.link if 'link' in feed.feed else '',
+            'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
            'articles': []
        }
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
                logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
            # 处理链接中可能存在的错误，比如ip或localhost
-            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
+            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
            article = {
                'title': entry.title if 'title' in entry else '',
@@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
    # path = re.sub(r'^https?://[^/]+', '', link)
    # print(path)
-    return link
+    try:
        parsed = urlparse(link)
        # NOTE(review): netloc keeps any ":port" (and userinfo), so the anchored
        # IPv4 pattern below does not match e.g. "127.0.0.1:4000"; parsed.hostname
        # is presumably what was intended -- confirm. The substring test also
        # matches any host that merely contains "localhost".
        if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc):  # IP address or localhost
            # Keep path + query from the original link (the fragment is not carried over)
            path = parsed.path or '/'
            if parsed.query:
                path += '?' + parsed.query
            # Re-root under blog_url: the base gets exactly one trailing "/" and the
            # path loses its leading "/", so urljoin appends rather than replaces.
            return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
        else:
            return link  # a regular domain: return the link unchanged
    except Exception as e:
        logging.warning(f"替换链接时出错：{link}, error: {e}")
        return link
 def process_friend(friend, session, count, specific_RSS=[]):
    """
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
    session = requests.Session()
    try:
-        response = session.get(json_url, headers=headers, timeout=timeout)
+        response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
        friends_data = response.json()
    except Exception as e:
        logging.error(f"无法获取链接：{json_url} ：{e}", exc_info=True)
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
    dict: 合并后的文章信息字典，已去重处理
    """
    try:
-        response = requests.get(marge_json_url, headers=headers, timeout=timeout)
+        response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
        marge_data = response.json()
    except Exception as e:
        logging.error(f"无法获取链接：{marge_json_url}，出现的问题为：{e}", exc_info=True)
@@ -6,10 +6,17 @@ import json
 import os
 # 标准化的请求头
-headers = {
+HEADERS_JSON = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 "
        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
    ),
    "X-Friend-Circle": "1.0"
 }
 def extract_emails_from_issues(api_url):
    """
    从GitHub issues API中提取以[e-mail]开头的title中的邮箱地址。
@@ -26,7 +33,7 @@ def extract_emails_from_issues(api_url):
    }
    """
    try:
-        response = requests.get(api_url, headers=headers)
+        response = requests.get(api_url, headers=HEADERS_JSON, timeout=10)
        response.raise_for_status()
        issues = response.json()
    except Exception as e:
@@ -1,95 +1,136 @@
 # 引入 check_feed 和 parse_feed 函数
 from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time, marge_data_from_json_url, marge_errors_from_json_url, deal_with_large_data
 from friend_circle_lite.get_conf import load_config
 from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
 from push_rss_update.send_email import send_emails
 import logging
 import json
 import sys
 import os
-# 日志记录
+from friend_circle_lite.get_info import (
-logging.basicConfig(level=logging.INFO, format='😋 %(levelname)s: %(message)s')
+    fetch_and_process_data,
    marge_data_from_json_url,
    marge_errors_from_json_url,
    deal_with_large_data
 )
 from friend_circle_lite.get_conf import load_config
 from rss_subscribe.push_article_update import (
    get_latest_articles_from_link,
    extract_emails_from_issues
 )
 from push_rss_update.send_email import send_emails
 # ========== 日志设置 ==========
 logging.basicConfig(
    level=logging.INFO,
    format='😋 %(levelname)s: %(message)s'
 )
-# 爬虫部分内容
+# ========== 加载配置 ==========
 config = load_config("./conf.yaml")
 # ========== 爬虫模块 ==========
 if config["spider_settings"]["enable"]:
-    logging.info("爬虫已启用")
+    logging.info("✅ 爬虫已启用")
    json_url = config['spider_settings']['json_url']
    article_count = config['spider_settings']['article_count']
-    specific_RSS = config['specific_RSS']
+    specific_rss = config['specific_RSS']
-    logging.info("正在从 {json_url} 中获取，每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
+
-    result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
+    logging.info(f"📥 正在从 {json_url} 获取数据，每个博客获取 {article_count} 篇文章")
    result, lost_friends = fetch_and_process_data(
        json_url=json_url,
        specific_RSS=specific_rss,
        count=article_count
    ) # type: ignore
    if config["spider_settings"]["merge_result"]["enable"]:
-        marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
+        merge_url = config['spider_settings']["merge_result"]['merge_json_url']
-        logging.info("合并数据功能开启，从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
+        logging.info(f"🔀 合并功能开启，从 {merge_url} 获取外部数据")
-        result = marge_data_from_json_url(result, marge_json_url + "/all.json")
+
-        lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
+        result = marge_data_from_json_url(result, f"{merge_url}/all.json")
-    logging.info("数据获取完毕，目前共有 {count} 位好友的动态，正在处理数据".format(count=len(result.get("article_data", []))))
+        lost_friends = marge_errors_from_json_url(lost_friends, f"{merge_url}/errors.json")
    article_count = len(result.get("article_data", []))
    logging.info(f"📦 数据获取完毕，共有 {article_count} 位好友的动态，正在处理数据")
    result = deal_with_large_data(result)
    with open("all.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    with open("errors.json", "w", encoding="utf-8") as f:
        json.dump(lost_friends, f, ensure_ascii=False, indent=2)
 # ========== 推送准备 ==========
 if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
-    logging.info("推送功能已启用，正在准备推送，获取配置信息")
+    logging.info("📨 推送功能已启用，正在准备中...")
-    email_settings = config["smtp"]
+
-    email = email_settings["email"]
+    smtp_conf = config["smtp"]
-    server = email_settings["server"]
+    sender_email = smtp_conf["email"]
-    port = email_settings["port"]
+    server = smtp_conf["server"]
-    use_tls = email_settings["use_tls"]
+    port = smtp_conf["port"]
    use_tls = smtp_conf["use_tls"]
    password = os.getenv("SMTP_PWD")
    logging.info("SMTP 服务器信息：{server}:{port}".format(server=server, port=port))
    logging.info("密码：{pwd}************".format(pwd=password[:3]))
    logging.info(f"📡 SMTP 服务器：{server}:{port}")
    if not password:
        logging.error("❌ 环境变量 SMTP_PWD 未设置，无法发送邮件")
        sys.exit(1)
    else:
        logging.info(f"🔐 密码(部分)：{password[:3]}*****")
 # ========== 邮件推送（待实现）==========
 if config["email_push"]["enable"]:
-    logging.info("邮件推送已启用")
+    logging.info("📧 邮件推送已启用")
-    logging.info("抱歉，目前暂未实现功能")
+    logging.info("⚠️ 抱歉，目前尚未实现邮件推送功能")
 if config["rss_subscribe"]["enable"]:
    logging.info("RSS 订阅推送已启用")
    # 获取并强制转换为字符串
    # 尝试从环境变量获取 FCL_REPO
    fcl_repo = os.getenv('FCL_REPO')
-    # 提取 github_username 和 github_repo
+# ========== RSS 订阅推送 ==========
 if config["rss_subscribe"]["enable"]:
    logging.info("📰 RSS 订阅推送已启用")
    smtp_conf = config["smtp"]
    sender_email = smtp_conf["email"]
    server = smtp_conf["server"]
    port = smtp_conf["port"]
    use_tls = smtp_conf["use_tls"]
    password = os.getenv("SMTP_PWD")
    # 获取 GitHub 仓库信息
    fcl_repo = os.getenv('FCL_REPO')
    if fcl_repo:
        github_username, github_repo = fcl_repo.split('/')
    else:
        github_username = str(config["rss_subscribe"]["github_username"]).strip()
        github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
-    
+
-    # 输出 github_username 和 github_repo
+    logging.info(f"👤 GitHub 用户名：{github_username}")
-    logging.info("github_username: {github_username}".format(github_username=github_username))
+    logging.info(f"📁 GitHub 仓库：{github_repo}")
-    logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
+
    your_blog_url = config["rss_subscribe"]["your_blog_url"]
    email_template = config["rss_subscribe"]["email_template"]
    # 获取网站信息
    website_title = config["rss_subscribe"]["website_info"]["title"]
-    # 获取最近更新的文章
+
    latest_articles = get_latest_articles_from_link(
        url=your_blog_url,
        count=5,
        last_articles_path="./rss_subscribe/last_articles.json"
-        )
+    )
-    logging.info("获取到的最新文章为：{latest_articles}".format(latest_articles=latest_articles))
+
-    if latest_articles == None:
+    if not latest_articles:
-        logging.info("无未进行推送的新文章")
+        logging.info("📭 无新文章，无需推送")
    else:
-        github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
+        logging.info(f"🆕 获取到的最新文章：{latest_articles}")
-        logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
+
        github_api_url = (
            f"https://api.github.com/repos/{github_username}/{github_repo}/issues"
            f"?state=closed&label=subscribed&per_page=200"
        )
        logging.info(f"🔎 正在从 GitHub 获取订阅邮箱：{github_api_url}")
        email_list = extract_emails_from_issues(github_api_url)
-        if email_list == None:
+
-            logging.info("无邮箱列表，请检查您的订阅列表是否有订阅者或订阅格式是否正确")
+        if not email_list:
            logging.info("⚠️ 无订阅邮箱，请检查格式或是否有订阅者")
            sys.exit(0)
-        else:
+
-            logging.info("获取到的邮箱列表为：{email_list}".format(email_list=email_list))
+        logging.info(f"📬 获取到邮箱列表：{email_list}")
-        # 循环latest_articles，发送邮件
+
        for article in latest_articles:
            template_data = {
                "title": article["title"],
@@ -97,17 +138,25 @@ if config["rss_subscribe"]["enable"]:
                "published": article["published"],
                "link": article["link"],
                "website_title": website_title,
-                "github_issue_url": f"https://github.com/{github_username}/{github_repo}/issues?q=is%3Aissue+is%3Aclosed",
+                "github_issue_url": (
                    f"https://github.com/{github_username}/{github_repo}"
                    "/issues?q=is%3Aissue+is%3Aclosed"
                ),
            }
-            
+
            send_emails(
                emails=email_list["emails"],
-                sender_email=email,
+                sender_email=sender_email,
                smtp_server=server,
                port=port,
                password=password,
-                subject= website_title + "の最新文章：" + article["title"],
+                subject=f"{website_title} の最新文章：{article['title']}",
-                body="文章链接：" + article["link"] + "\n" + "文章内容：" + article["summary"] + "\n" + "发布时间：" + article["published"],
+                body=(
                    f"📄 文章标题：{article['title']}\n"
                    f"🔗 链接：{article['link']}\n"
                    f"📝 简介：{article['summary']}\n"
                    f"🕒 发布时间：{article['published']}"
                ),
                template_path=email_template,
                template_data=template_data,
                use_tls=use_tls