From 5b47d2d3bc75c659717a48b7b7179733bf34b0f4 Mon Sep 17 00:00:00 2001
From: LiuShen <3162475700@qq.com>
Date: Sat, 3 May 2025 15:45:52 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=97=E6=9B=B4=E5=8A=A0=E5=90=88?=
 =?UTF-8?q?=E7=90=86=E7=9A=84=E8=AF=B7=E6=B1=82=E5=A4=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 friend_circle_lite/get_info.py       |  58 ++++++++--
 rss_subscribe/push_article_update.py |  13 ++-
 run.py                               | 165 +++++++++++++++++----------
 3 files changed, 163 insertions(+), 73 deletions(-)

diff --git a/friend_circle_lite/get_info.py b/friend_circle_lite/get_info.py
index 5127c1c..ea43301 100644
--- a/friend_circle_lite/get_info.py
+++ b/friend_circle_lite/get_info.py
@@ -1,14 +1,36 @@
 import logging
 from datetime import datetime, timedelta, timezone
+import re
+from typing import Any
+from urllib.parse import urljoin, urlparse
 from dateutil import parser
 import requests
-import re
 import feedparser
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # 标准化的请求头
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
+HEADERS_JSON = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "X-Friend-Circle": "1.0"
+}
+
+HEADERS_XML = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "X-Friend-Circle": "1.0"
 }
 
 timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
     for feed_type, path in possible_feeds:
         feed_url = blog_url.rstrip('/') + path
         try:
-            response = session.get(feed_url, headers=headers, timeout=timeout)
+            response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
             if response.status_code == 200:
                 return [feed_type, feed_url]
         except requests.RequestException:
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
         dict: 包含网站名称、作者、原链接和每篇文章详细内容的字典。
     """
     try:
-        response = session.get(url, headers=headers, timeout=timeout)
+        response = session.get(url, headers=HEADERS_XML, timeout=timeout)
         response.encoding = response.apparent_encoding or 'utf-8'
         feed = feedparser.parse(response.text)
 
         result = {
-            'website_name': feed.feed.title if 'title' in feed.feed else '',
-            'author': feed.feed.author if 'author' in feed.feed else '',
-            'link': feed.feed.link if 'link' in feed.feed else '',
+            'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
+            'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
+            'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
             'articles': []
         }
 
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
                 logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
 
             # 处理链接中可能存在的错误,比如ip或localhost
-            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
+            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
 
             article = {
                 'title': entry.title if 'title' in entry else '',
@@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
     # path = re.sub(r'^https?://[^/]+', '', link)
     # print(path)
 
-    return link
+    try:
+        parsed = urlparse(link)
+        if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc): # IP地址或localhost
+            # 提取 path + query
+            path = parsed.path or '/'
+            if parsed.query:
+                path += '?' + parsed.query
+            return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
+        else:
+            return link # 合法域名则返回原链接
+    except Exception as e:
+        logging.warning(f"替换链接时出错:{link}, error: {e}")
+        return link
 
 def process_friend(friend, session, count, specific_RSS=[]):
     """
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
     session = requests.Session()
 
     try:
-        response = session.get(json_url, headers=headers, timeout=timeout)
+        response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
         friends_data = response.json()
     except Exception as e:
         logging.error(f"无法获取链接:{json_url} :{e}", exc_info=True)
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
         dict: 合并后的文章信息字典,已去重处理
     """
     try:
-        response = requests.get(marge_json_url, headers=headers, timeout=timeout)
+        response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
         marge_data = response.json()
     except Exception as e:
         logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)
diff --git a/rss_subscribe/push_article_update.py b/rss_subscribe/push_article_update.py
index df3ccec..d43f1ad 100644
--- a/rss_subscribe/push_article_update.py
+++ b/rss_subscribe/push_article_update.py
@@ -6,10 +6,17 @@ import json
 import os
 
 # 标准化的请求头
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+HEADERS_JSON = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "X-Friend-Circle": "1.0"
 }
 
+
 def extract_emails_from_issues(api_url):
     """
     从GitHub issues API中提取以[e-mail]开头的title中的邮箱地址。
@@ -26,7 +33,7 @@
         }
     """
     try:
-        response = requests.get(api_url, headers=headers)
+        response = requests.get(api_url, headers=HEADERS_JSON, timeout=10)
         response.raise_for_status()
         issues = response.json()
     except Exception as e:
diff --git a/run.py b/run.py
index 29f7c34..df21df6 100644
--- a/run.py
+++ b/run.py
@@ -1,95 +1,136 @@
-# 引入 check_feed 和 parse_feed 函数
-from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time, marge_data_from_json_url, marge_errors_from_json_url, deal_with_large_data
-from friend_circle_lite.get_conf import load_config
-from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
-from push_rss_update.send_email import send_emails
-
 import logging
 import json
 import sys
 import os
 
-# 日志记录
-logging.basicConfig(level=logging.INFO, format='😋 %(levelname)s: %(message)s')
+from friend_circle_lite.get_info import (
+    fetch_and_process_data,
+    marge_data_from_json_url,
+    marge_errors_from_json_url,
+    deal_with_large_data
+)
+from friend_circle_lite.get_conf import load_config
+from rss_subscribe.push_article_update import (
+    get_latest_articles_from_link,
+    extract_emails_from_issues
+)
+from push_rss_update.send_email import send_emails
 
+# ========== 日志设置 ==========
+logging.basicConfig(
+    level=logging.INFO,
+    format='😋 %(levelname)s: %(message)s'
+)
 
-# 爬虫部分内容
+# ========== 加载配置 ==========
 config = load_config("./conf.yaml")
+
+# ========== 爬虫模块 ==========
 if config["spider_settings"]["enable"]:
-    logging.info("爬虫已启用")
+    logging.info("✅ 爬虫已启用")
+
     json_url = config['spider_settings']['json_url']
     article_count = config['spider_settings']['article_count']
-    specific_RSS = config['specific_RSS']
-    logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
-    result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
+    specific_rss = config['specific_RSS']
+
+    logging.info(f"📥 正在从 {json_url} 获取数据,每个博客获取 {article_count} 篇文章")
+    result, lost_friends = fetch_and_process_data(
+        json_url=json_url,
+        specific_RSS=specific_rss,
+        count=article_count
+    ) # type: ignore
+
     if config["spider_settings"]["merge_result"]["enable"]:
-        marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
-        logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
-        result = marge_data_from_json_url(result, marge_json_url + "/all.json")
-        lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
-    logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
+        merge_url = config['spider_settings']["merge_result"]['merge_json_url']
+        logging.info(f"🔀 合并功能开启,从 {merge_url} 获取外部数据")
+
+        result = marge_data_from_json_url(result, f"{merge_url}/all.json")
+        lost_friends = marge_errors_from_json_url(lost_friends, f"{merge_url}/errors.json")
+
+    article_count = len(result.get("article_data", []))
+    logging.info(f"📦 数据获取完毕,共有 {article_count} 位好友的动态,正在处理数据")
+
     result = deal_with_large_data(result)
 
     with open("all.json", "w", encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+
     with open("errors.json", "w", encoding="utf-8") as f:
         json.dump(lost_friends, f, ensure_ascii=False, indent=2)
 
+# ========== 推送准备 ==========
 if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
-    logging.info("推送功能已启用,正在准备推送,获取配置信息")
-    email_settings = config["smtp"]
-    email = email_settings["email"]
-    server = email_settings["server"]
-    port = email_settings["port"]
-    use_tls = email_settings["use_tls"]
+    logging.info("📨 推送功能已启用,正在准备中...")
+
+    smtp_conf = config["smtp"]
+    sender_email = smtp_conf["email"]
+    server = smtp_conf["server"]
+    port = smtp_conf["port"]
+    use_tls = smtp_conf["use_tls"]
     password = os.getenv("SMTP_PWD")
-    logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
-    logging.info("密码:{pwd}************".format(pwd=password[:3]))
+
+    logging.info(f"📡 SMTP 服务器:{server}:{port}")
+    if not password:
+        logging.error("❌ 环境变量 SMTP_PWD 未设置,无法发送邮件")
+        sys.exit(1)
+    else:
+        logging.info(f"🔐 密码(部分):{password[:3]}*****")
 
+# ========== 邮件推送(待实现)==========
 if config["email_push"]["enable"]:
-    logging.info("邮件推送已启用")
-    logging.info("抱歉,目前暂未实现功能")
-
-if config["rss_subscribe"]["enable"]:
-    logging.info("RSS 订阅推送已启用")
-    # 获取并强制转换为字符串
-    # 尝试从环境变量获取 FCL_REPO
-    fcl_repo = os.getenv('FCL_REPO')
+    logging.info("📧 邮件推送已启用")
+    logging.info("⚠️ 抱歉,目前尚未实现邮件推送功能")
 
-    # 提取 github_username 和 github_repo
+# ========== RSS 订阅推送 ==========
+if config["rss_subscribe"]["enable"]:
+    logging.info("📰 RSS 订阅推送已启用")
+
+    smtp_conf = config["smtp"]
+    sender_email = smtp_conf["email"]
+    server = smtp_conf["server"]
+    port = smtp_conf["port"]
+    use_tls = smtp_conf["use_tls"]
+    password = os.getenv("SMTP_PWD")
+
+    # 获取 GitHub 仓库信息
+    fcl_repo = os.getenv('FCL_REPO')
     if fcl_repo:
         github_username, github_repo = fcl_repo.split('/')
     else:
         github_username = str(config["rss_subscribe"]["github_username"]).strip()
         github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
-
-    # 输出 github_username 和 github_repo
-    logging.info("github_username: {github_username}".format(github_username=github_username))
-    logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
-
+
+    logging.info(f"👤 GitHub 用户名:{github_username}")
+    logging.info(f"📁 GitHub 仓库:{github_repo}")
+
     your_blog_url = config["rss_subscribe"]["your_blog_url"]
     email_template = config["rss_subscribe"]["email_template"]
-    # 获取网站信息
     website_title = config["rss_subscribe"]["website_info"]["title"]
-    # 获取最近更新的文章
+
     latest_articles = get_latest_articles_from_link(
         url=your_blog_url,
         count=5,
         last_articles_path="./rss_subscribe/last_articles.json"
-        )
-    logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
-    if latest_articles == None:
-        logging.info("无未进行推送的新文章")
+    )
+
+    if not latest_articles:
+        logging.info("📭 无新文章,无需推送")
     else:
-        github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
-        logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
+        logging.info(f"🆕 获取到的最新文章:{latest_articles}")
+
+        github_api_url = (
+            f"https://api.github.com/repos/{github_username}/{github_repo}/issues"
+            f"?state=closed&label=subscribed&per_page=200"
+        )
+        logging.info(f"🔎 正在从 GitHub 获取订阅邮箱:{github_api_url}")
         email_list = extract_emails_from_issues(github_api_url)
-        if email_list == None:
-            logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确")
+
+        if not email_list:
+            logging.info("⚠️ 无订阅邮箱,请检查格式或是否有订阅者")
             sys.exit(0)
-        else:
-            logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list))
-            # 循环latest_articles,发送邮件
+
+        logging.info(f"📬 获取到邮箱列表:{email_list}")
+
         for article in latest_articles:
             template_data = {
                 "title": article["title"],
@@ -97,17 +138,25 @@ if config["rss_subscribe"]["enable"]:
                 "published": article["published"],
                 "link": article["link"],
                 "website_title": website_title,
-                "github_issue_url": f"https://github.com/{github_username}/{github_repo}/issues?q=is%3Aissue+is%3Aclosed",
+                "github_issue_url": (
+                    f"https://github.com/{github_username}/{github_repo}"
+                    "/issues?q=is%3Aissue+is%3Aclosed"
+                ),
             }
-
+
             send_emails(
                 emails=email_list["emails"],
-                sender_email=email,
+                sender_email=sender_email,
                 smtp_server=server,
                 port=port,
                 password=password,
-                subject= website_title + "の最新文章:" + article["title"],
-                body="文章链接:" + article["link"] + "\n" + "文章内容:" + article["summary"] + "\n" + "发布时间:" + article["published"],
+                subject=f"{website_title} の最新文章:{article['title']}",
+                body=(
+                    f"📄 文章标题:{article['title']}\n"
+                    f"🔗 链接:{article['link']}\n"
+                    f"📝 简介:{article['summary']}\n"
+                    f"🕒 发布时间:{article['published']}"
+                ),
                 template_path=email_template,
                 template_data=template_data,
                 use_tls=use_tls