diff --git a/friend_circle_lite/get_info.py b/friend_circle_lite/get_info.py index 577ed43..4239605 100644 --- a/friend_circle_lite/get_info.py +++ b/friend_circle_lite/get_info.py @@ -2,15 +2,16 @@ import logging from datetime import datetime, timedelta, timezone from dateutil import parser import requests +import re import feedparser from concurrent.futures import ThreadPoolExecutor, as_completed # 设置日志配置 -logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s') +logging.basicConfig(level=logging.INFO, format='🤪%(levelname)s: %(message)s') # 标准化的请求头 headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50' } timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长 @@ -55,16 +56,16 @@ def format_published_time(time_str): return shanghai_time.strftime('%Y-%m-%d %H:%M') -def check_feed(blog_url, session): - """ - 检查博客的 RSS 或 Atom 订阅链接。 - 此函数接受一个博客地址,尝试在其后拼接 '/atom.xml', '/rss2.xml' 和 '/feed',并检查这些链接是否可访问。 - Atom 优先,如果都不能访问,则返回 ['none', 源地址]。 +def check_feed(blog_url, session, headers=None, timeout=10): + """ + 检查博客的 RSS 或 Atom 订阅链接,使用多线程提高效率,禁止重定向。 参数: blog_url (str): 博客的基础 URL。 session (requests.Session): 用于请求的会话对象。 + headers (dict, 可选): 自定义请求头。 + timeout (int, 可选): 请求的超时限制,默认为 10 秒。 返回: list: 包含类型和拼接后的链接的列表。如果 atom 链接可访问,则返回 ['atom', atom_url]; @@ -75,26 +76,40 @@ def check_feed(blog_url, session): possible_feeds = [ ('atom', '/atom.xml'), - ('rss', '/rss.xml'), # 2024-07-26 添加 /rss.xml内容的支持 + ('rss', '/rss.xml'), ('rss2', '/rss2.xml'), ('feed', '/feed'), - ('feed2', '/feed.xml'), # 2024-07-26 添加 /feed.xml内容的支持 + ('feed2', '/feed.xml'), ('feed3', '/feed/'), - ('index', '/index.xml') # 2024-07-25 添加 /index.xml内容的支持 + ('index', '/index.xml') ] - for feed_type, path in possible_feeds: + def fetch_feed(feed_type, path): feed_url = blog_url.rstrip('/') + path try: - response = session.get(feed_url, headers=headers, timeout=timeout) + response = session.get(feed_url, headers=headers, timeout=timeout, allow_redirects=False) if response.status_code == 200: return [feed_type, feed_url] + elif response.status_code in [301, 302, 303]: + return None # 重定向,不处理 except requests.RequestException: - continue + return None # 请求异常,不处理 + + # 使用 ThreadPoolExecutor 执行多个线程 + with ThreadPoolExecutor() as executor: + futures = [executor.submit(fetch_feed, feed_type, path) for feed_type, path in possible_feeds] + + # 等待线程完成并获取结果 + for future in as_completed(futures): + result = future.result() + if result: + return result # 如果找到有效的订阅链接,返回 + logging.warning(f"无法找到 {blog_url} 的订阅链接") return ['none', blog_url] -def parse_feed(url, session, count=5): + +def parse_feed(url, session, count=5, blog_url=''): """ 解析 Atom 或 RSS2 feed 并返回包含网站名称、作者、原链接和每篇文章详细内容的字典。 @@ -121,7 +136,7 @@ def parse_feed(url, session, count=5): 'articles': [] } - for i, entry in enumerate(feed.entries): + for _ , entry in enumerate(feed.entries): if 'published' in entry: published = format_published_time(entry.published) @@ -131,11 +146,15 @@ def parse_feed(url, session, count=5): logging.warning(f"文章 {entry.title} 未包含发布时间,已使用更新时间 {published}") else: published = '' - logging.warning(f"文章 {entry.title} 未包含任何时间信息") + logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间") + + # 处理链接中可能存在的错误,比如ip或localhost + article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' + article = { 'title': entry.title if 'title' in entry else '', 'author': result['author'], - 'link': entry.link if 'link' in entry else '', + 'link': article_link, 'published': published, 'summary': entry.summary if 'summary' in entry else '', 'content': entry.content[0].value if 'content' in entry and entry.content else entry.description if 'description' in entry else '' @@ -149,7 +168,7 @@ def parse_feed(url, session, count=5): return result except Exception as e: - logging.error(f"无法解析FEED地址:{url} ,请自行排查原因!", exc_info=True) + logging.error(f"无法解析FEED地址:{url} ,请自行排查原因!") return { 'website_name': '', 'author': '', @@ -157,6 +176,23 @@ def parse_feed(url, session, count=5): 'articles': [] } +def replace_non_domain(link: str, blog_url: str) -> str: + """ + 暂未实现 + 检测并替换字符串中的非正常域名部分(如 IP 地址或 localhost),替换为 blog_url。 + 替换后强制使用 https,且考虑 blog_url 尾部是否有斜杠。 + + :param link: 原始地址字符串 + :param blog_url: 替换为的博客地址 + :return: 替换后的地址字符串 + """ + + # 提取link中的路径部分,无需协议和域名 + # path = re.sub(r'^https?://[^/]+', '', link) + # print(path) + + return link + def process_friend(friend, session, count, specific_RSS=[]): """ 处理单个朋友的博客信息。 @@ -179,13 +215,13 @@ def process_friend(friend, session, count, specific_RSS=[]): if rss_feed: feed_url = rss_feed feed_type = 'specific' - logging.info(f"“{name}”的博客“{blog_url}”为特定RSS源“{feed_url}”") + logging.info(f"“{name}”的博客“ {blog_url} ”为特定RSS源“ {feed_url} ”") else: feed_type, feed_url = check_feed(blog_url, session) - logging.info(f"“{name}”的博客“{blog_url}”的feed类型为“{feed_type}”") + logging.info(f"“{name}”的博客“ {blog_url} ”的feed类型为“{feed_type}”, feed地址为“ {feed_url} ”") if feed_type != 'none': - feed_info = parse_feed(feed_url, session, count) + feed_info = parse_feed(feed_url, session, count, blog_url) articles = [ { 'title': article['title'], diff --git a/push_rss_update/send_email.py b/push_rss_update/send_email.py index ed04ee3..71e3042 100644 --- a/push_rss_update/send_email.py +++ b/push_rss_update/send_email.py @@ -1,9 +1,13 @@ +import logging import smtplib from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from jinja2 import Environment, FileSystemLoader import os +logging.basicConfig(level=logging.INFO, format='😬%(levelname)s: %(message)s') + + def email_sender( target_email, sender_email, @@ -56,7 +60,7 @@ def email_sender( server.sendmail(sender_email, target_email, msg.as_string()) print(f'邮件已发送到 {target_email}') except Exception as e: - print(f'无法发送邮件到 {target_email}. 错误: {e}') + logging.error(f'邮件发送失败,目标地址: {target_email},错误信息: {e}') def send_emails(emails, sender_email, smtp_server, port, password, subject, body, template_path=None, template_data=None, use_tls=True): """ @@ -75,6 +79,5 @@ def send_emails(emails, sender_email, smtp_server, port, password, subject, body use_tls (bool): 是否使用 TLS 加密。默认为 True。 """ for email in emails: - print(f'正在发送邮件到 {email}') - print(f'---------------------------\n邮件主题: {subject}\n邮件内容: {body}\n发件人: {sender_email}\n---------------------------') + logging.info(f'正在发送邮件到 {email},邮件内容: {subject}') email_sender(email, sender_email, smtp_server, port, password, subject, body, template_path, template_data, use_tls) \ No newline at end of file diff --git a/rss_subscribe/push_article_update.py b/rss_subscribe/push_article_update.py index 419a98c..4823067 100644 --- a/rss_subscribe/push_article_update.py +++ b/rss_subscribe/push_article_update.py @@ -1,9 +1,14 @@ +import logging import requests import re from friend_circle_lite.get_info import check_feed, parse_feed import json import os +# 日志配置 +logging.basicConfig(level=logging.INFO, format='⭐%(levelname)s: %(message)s') + + # 标准化的请求头 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' @@ -29,7 +34,7 @@ def extract_emails_from_issues(api_url): response.raise_for_status() issues = response.json() except Exception as e: - print(f"无法获取该链接:{api_url}\n出现的问题为:{e}") + logging.error(f"无法获取 GitHub issues 数据,错误信息: {e}") return None email_pattern = re.compile(r'^\[邮箱订阅\](.+)$') @@ -62,7 +67,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr session = requests.Session() feed_type, feed_url = check_feed(url, session) if feed_type == 'none': - print(f"无法访问 {url} 的 feed") + logging.error(f"无法获取 {url} 的文章数据") return None # 获取最新的文章数据 @@ -86,7 +91,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr if article['link'] not in last_titles: updated_articles.append(article) - print(f"从 {url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章") + logging.info(f"从 {url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章") # 更新本地存储的文章数据 with open(local_file, 'w', encoding='utf-8') as file: diff --git a/run.py b/run.py index fcb1409..64e2a16 100644 --- a/run.py +++ b/run.py @@ -4,24 +4,30 @@ from friend_circle_lite.get_conf import load_config from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues from push_rss_update.send_email import send_emails +import logging import json import sys import os +# 日志记录 +logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s') + + # 爬虫部分内容 config = load_config("./conf.yaml") if config["spider_settings"]["enable"]: - print("爬虫已启用") + logging.info("爬虫已启用") json_url = config['spider_settings']['json_url'] article_count = config['spider_settings']['article_count'] specific_RSS = config['specific_RSS'] - print("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count)) + logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count)) result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count) if config["spider_settings"]["merge_result"]["enable"]: marge_json_url = config['spider_settings']["merge_result"]['merge_json_url'] - print("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json")) + logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json")) result = marge_data_from_json_url(result, marge_json_url + "/all.json") lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json") + logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", [])))) result = deal_with_large_data(result) with open("all.json", "w", encoding="utf-8") as f: @@ -30,20 +36,22 @@ if config["spider_settings"]["enable"]: json.dump(lost_friends, f, ensure_ascii=False, indent=2) if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]: - print("获取smtp配置信息") + logging.info("推送功能已启用,正在准备推送,获取配置信息") email_settings = config["smtp"] email = email_settings["email"] server = email_settings["server"] port = email_settings["port"] use_tls = email_settings["use_tls"] password = os.getenv("SMTP_PWD") - print("密码检测是否存在:", password[:2], "****", password[-2:]) + logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port)) + logging.info("密码:{pwd}************".format(pwd=password[:3])) if config["email_push"]["enable"]: - print("邮件推送已启用") + logging.info("邮件推送已启用") + logging.info("抱歉,目前暂未实现功能") if config["rss_subscribe"]["enable"]: - print("RSS通过issue订阅已启用") + logging.info("RSS 订阅推送已启用") # 获取并强制转换为字符串 # 尝试从环境变量获取 FCL_REPO fcl_repo = os.getenv('FCL_REPO') @@ -51,13 +59,13 @@ if config["rss_subscribe"]["enable"]: # 提取 github_username 和 github_repo if fcl_repo: github_username, github_repo = fcl_repo.split('/') - print(f"从环境变量获取到的 GitHub Username: {github_username}") - print(f"从环境变量获取到的 GitHub Repo: {github_repo}") else: github_username = str(config["rss_subscribe"]["github_username"]).strip() github_repo = str(config["rss_subscribe"]["github_repo"]).strip() - print(f"从配置文件获取到的 GitHub Username: {github_username}") - print(f"从配置文件获取到的 GitHub Repo: {github_repo}") + + # 输出 github_username 和 github_repo + logging.info("github_username: {github_username}".format(github_username=github_username)) + logging.info("github_repo: {github_repo}".format(github_repo=github_repo)) your_blog_url = config["rss_subscribe"]["your_blog_url"] email_template = config["rss_subscribe"]["email_template"] @@ -69,18 +77,18 @@ if config["rss_subscribe"]["enable"]: count=5, last_articles_path="./rss_subscribe/last_articles.json" ) - print("最新文章为:", latest_articles) + logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles)) if latest_articles == None: - print("没有新文章") + logging.info("无未进行推送的新文章") else: github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200" - print("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url)) + logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url)) email_list = extract_emails_from_issues(github_api_url) if email_list == None: - print("无邮箱列表") + logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确") sys.exit(0) else: - print("获取到的邮箱列表为:", email_list) + logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list)) # 循环latest_articles,发送邮件 for article in latest_articles: template_data = {