🤔将获取rss的函数改为多线程(dev)测试中

This commit is contained in:
柳神 2024-11-23 00:43:43 +08:00
parent 55d88561b2
commit aa14d1b706
4 changed files with 95 additions and 43 deletions

View File

@ -2,15 +2,16 @@ import logging
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from dateutil import parser from dateutil import parser
import requests import requests
import re
import feedparser import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
# 设置日志配置 # 设置日志配置
logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s') logging.basicConfig(level=logging.INFO, format='🤪%(levelname)s: %(message)s')
# 标准化的请求头 # 标准化的请求头
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
} }
timeout = (10, 15) # 连接超时和读取超时防止requests接受时间过长 timeout = (10, 15) # 连接超时和读取超时防止requests接受时间过长
@ -55,16 +56,16 @@ def format_published_time(time_str):
return shanghai_time.strftime('%Y-%m-%d %H:%M') return shanghai_time.strftime('%Y-%m-%d %H:%M')
def check_feed(blog_url, session):
"""
检查博客的 RSS Atom 订阅链接
此函数接受一个博客地址尝试在其后拼接 '/atom.xml', '/rss2.xml' '/feed'并检查这些链接是否可访问 def check_feed(blog_url, session, headers=None, timeout=10):
Atom 优先如果都不能访问则返回 ['none', 源地址] """
检查博客的 RSS Atom 订阅链接使用多线程提高效率禁止重定向
参数 参数
blog_url (str): 博客的基础 URL blog_url (str): 博客的基础 URL
session (requests.Session): 用于请求的会话对象 session (requests.Session): 用于请求的会话对象
headers (dict, 可选): 自定义请求头
timeout (int, 可选): 请求的超时限制默认为 10
返回 返回
list: 包含类型和拼接后的链接的列表如果 atom 链接可访问则返回 ['atom', atom_url] list: 包含类型和拼接后的链接的列表如果 atom 链接可访问则返回 ['atom', atom_url]
@ -75,26 +76,40 @@ def check_feed(blog_url, session):
possible_feeds = [ possible_feeds = [
('atom', '/atom.xml'), ('atom', '/atom.xml'),
('rss', '/rss.xml'), # 2024-07-26 添加 /rss.xml内容的支持 ('rss', '/rss.xml'),
('rss2', '/rss2.xml'), ('rss2', '/rss2.xml'),
('feed', '/feed'), ('feed', '/feed'),
('feed2', '/feed.xml'), # 2024-07-26 添加 /feed.xml内容的支持 ('feed2', '/feed.xml'),
('feed3', '/feed/'), ('feed3', '/feed/'),
('index', '/index.xml') # 2024-07-25 添加 /index.xml内容的支持 ('index', '/index.xml')
] ]
for feed_type, path in possible_feeds: def fetch_feed(feed_type, path):
feed_url = blog_url.rstrip('/') + path feed_url = blog_url.rstrip('/') + path
try: try:
response = session.get(feed_url, headers=headers, timeout=timeout) response = session.get(feed_url, headers=headers, timeout=timeout, allow_redirects=False)
if response.status_code == 200: if response.status_code == 200:
return [feed_type, feed_url] return [feed_type, feed_url]
elif response.status_code in [301, 302, 303]:
return None # 重定向,不处理
except requests.RequestException: except requests.RequestException:
continue return None # 请求异常,不处理
# 使用 ThreadPoolExecutor 执行多个线程
with ThreadPoolExecutor() as executor:
futures = [executor.submit(fetch_feed, feed_type, path) for feed_type, path in possible_feeds]
# 等待线程完成并获取结果
for future in as_completed(futures):
result = future.result()
if result:
return result # 如果找到有效的订阅链接,返回
logging.warning(f"无法找到 {blog_url} 的订阅链接") logging.warning(f"无法找到 {blog_url} 的订阅链接")
return ['none', blog_url] return ['none', blog_url]
def parse_feed(url, session, count=5):
def parse_feed(url, session, count=5, blog_url=''):
""" """
解析 Atom RSS2 feed 并返回包含网站名称作者原链接和每篇文章详细内容的字典 解析 Atom RSS2 feed 并返回包含网站名称作者原链接和每篇文章详细内容的字典
@ -121,7 +136,7 @@ def parse_feed(url, session, count=5):
'articles': [] 'articles': []
} }
for i, entry in enumerate(feed.entries): for _ , entry in enumerate(feed.entries):
if 'published' in entry: if 'published' in entry:
published = format_published_time(entry.published) published = format_published_time(entry.published)
@ -131,11 +146,15 @@ def parse_feed(url, session, count=5):
logging.warning(f"文章 {entry.title} 未包含发布时间,已使用更新时间 {published}") logging.warning(f"文章 {entry.title} 未包含发布时间,已使用更新时间 {published}")
else: else:
published = '' published = ''
logging.warning(f"文章 {entry.title} 未包含任何时间信息") logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
# 处理链接中可能存在的错误比如ip或localhost
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
article = { article = {
'title': entry.title if 'title' in entry else '', 'title': entry.title if 'title' in entry else '',
'author': result['author'], 'author': result['author'],
'link': entry.link if 'link' in entry else '', 'link': article_link,
'published': published, 'published': published,
'summary': entry.summary if 'summary' in entry else '', 'summary': entry.summary if 'summary' in entry else '',
'content': entry.content[0].value if 'content' in entry and entry.content else entry.description if 'description' in entry else '' 'content': entry.content[0].value if 'content' in entry and entry.content else entry.description if 'description' in entry else ''
@ -149,7 +168,7 @@ def parse_feed(url, session, count=5):
return result return result
except Exception as e: except Exception as e:
logging.error(f"无法解析FEED地址{url} ,请自行排查原因!", exc_info=True) logging.error(f"无法解析FEED地址{url} ,请自行排查原因!")
return { return {
'website_name': '', 'website_name': '',
'author': '', 'author': '',
@ -157,6 +176,23 @@ def parse_feed(url, session, count=5):
'articles': [] 'articles': []
} }
def replace_non_domain(link: str, blog_url: str) -> str:
"""
暂未实现
检测并替换字符串中的非正常域名部分 IP 地址或 localhost替换为 blog_url
替换后强制使用 https且考虑 blog_url 尾部是否有斜杠
:param link: 原始地址字符串
:param blog_url: 替换为的博客地址
:return: 替换后的地址字符串
"""
# 提取link中的路径部分无需协议和域名
# path = re.sub(r'^https?://[^/]+', '', link)
# print(path)
return link
def process_friend(friend, session, count, specific_RSS=[]): def process_friend(friend, session, count, specific_RSS=[]):
""" """
处理单个朋友的博客信息 处理单个朋友的博客信息
@ -179,13 +215,13 @@ def process_friend(friend, session, count, specific_RSS=[]):
if rss_feed: if rss_feed:
feed_url = rss_feed feed_url = rss_feed
feed_type = 'specific' feed_type = 'specific'
logging.info(f"{name}”的博客“{blog_url}”为特定RSS源“{feed_url}") logging.info(f"{name}”的博客“ {blog_url} ”为特定RSS源“ {feed_url} ")
else: else:
feed_type, feed_url = check_feed(blog_url, session) feed_type, feed_url = check_feed(blog_url, session)
logging.info(f"{name}”的博客“{blog_url}”的feed类型为“{feed_type}") logging.info(f"{name}”的博客“ {blog_url} ”的feed类型为“{feed_type}”, feed地址为“ {feed_url} ")
if feed_type != 'none': if feed_type != 'none':
feed_info = parse_feed(feed_url, session, count) feed_info = parse_feed(feed_url, session, count, blog_url)
articles = [ articles = [
{ {
'title': article['title'], 'title': article['title'],

View File

@ -1,9 +1,13 @@
import logging
import smtplib import smtplib
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText from email.mime.text import MIMEText
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
import os import os
logging.basicConfig(level=logging.INFO, format='😬%(levelname)s: %(message)s')
def email_sender( def email_sender(
target_email, target_email,
sender_email, sender_email,
@ -56,7 +60,7 @@ def email_sender(
server.sendmail(sender_email, target_email, msg.as_string()) server.sendmail(sender_email, target_email, msg.as_string())
print(f'邮件已发送到 {target_email}') print(f'邮件已发送到 {target_email}')
except Exception as e: except Exception as e:
print(f'无法发送邮件到 {target_email}. 错误: {e}') logging.error(f'邮件发送失败,目标地址: {target_email},错误信息: {e}')
def send_emails(emails, sender_email, smtp_server, port, password, subject, body, template_path=None, template_data=None, use_tls=True): def send_emails(emails, sender_email, smtp_server, port, password, subject, body, template_path=None, template_data=None, use_tls=True):
""" """
@ -75,6 +79,5 @@ def send_emails(emails, sender_email, smtp_server, port, password, subject, body
use_tls (bool): 是否使用 TLS 加密默认为 True use_tls (bool): 是否使用 TLS 加密默认为 True
""" """
for email in emails: for email in emails:
print(f'正在发送邮件到 {email}') logging.info(f'正在发送邮件到 {email},邮件内容: {subject}')
print(f'---------------------------\n邮件主题: {subject}\n邮件内容: {body}\n发件人: {sender_email}\n---------------------------')
email_sender(email, sender_email, smtp_server, port, password, subject, body, template_path, template_data, use_tls) email_sender(email, sender_email, smtp_server, port, password, subject, body, template_path, template_data, use_tls)

View File

@ -1,9 +1,14 @@
import logging
import requests import requests
import re import re
from friend_circle_lite.get_info import check_feed, parse_feed from friend_circle_lite.get_info import check_feed, parse_feed
import json import json
import os import os
# 日志配置
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# 标准化的请求头 # 标准化的请求头
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
@ -29,7 +34,7 @@ def extract_emails_from_issues(api_url):
response.raise_for_status() response.raise_for_status()
issues = response.json() issues = response.json()
except Exception as e: except Exception as e:
print(f"无法获取该链接:{api_url}\n出现的问题为:{e}") logging.error(f"无法获取 GitHub issues 数据,错误信息: {e}")
return None return None
email_pattern = re.compile(r'^\[邮箱订阅\](.+)$') email_pattern = re.compile(r'^\[邮箱订阅\](.+)$')
@ -62,7 +67,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr
session = requests.Session() session = requests.Session()
feed_type, feed_url = check_feed(url, session) feed_type, feed_url = check_feed(url, session)
if feed_type == 'none': if feed_type == 'none':
print(f"无法访问 {url} 的 feed") logging.error(f"无法获取 {url} 的文章数据")
return None return None
# 获取最新的文章数据 # 获取最新的文章数据
@ -86,7 +91,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr
if article['link'] not in last_titles: if article['link'] not in last_titles:
updated_articles.append(article) updated_articles.append(article)
print(f"{url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章") logging.info(f"{url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章")
# 更新本地存储的文章数据 # 更新本地存储的文章数据
with open(local_file, 'w', encoding='utf-8') as file: with open(local_file, 'w', encoding='utf-8') as file:

40
run.py
View File

@ -4,24 +4,30 @@ from friend_circle_lite.get_conf import load_config
from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
from push_rss_update.send_email import send_emails from push_rss_update.send_email import send_emails
import logging
import json import json
import sys import sys
import os import os
# 日志记录
logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s')
# 爬虫部分内容 # 爬虫部分内容
config = load_config("./conf.yaml") config = load_config("./conf.yaml")
if config["spider_settings"]["enable"]: if config["spider_settings"]["enable"]:
print("爬虫已启用") logging.info("爬虫已启用")
json_url = config['spider_settings']['json_url'] json_url = config['spider_settings']['json_url']
article_count = config['spider_settings']['article_count'] article_count = config['spider_settings']['article_count']
specific_RSS = config['specific_RSS'] specific_RSS = config['specific_RSS']
print("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count)) logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count) result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
if config["spider_settings"]["merge_result"]["enable"]: if config["spider_settings"]["merge_result"]["enable"]:
marge_json_url = config['spider_settings']["merge_result"]['merge_json_url'] marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
print("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json")) logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
result = marge_data_from_json_url(result, marge_json_url + "/all.json") result = marge_data_from_json_url(result, marge_json_url + "/all.json")
lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json") lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
result = deal_with_large_data(result) result = deal_with_large_data(result)
with open("all.json", "w", encoding="utf-8") as f: with open("all.json", "w", encoding="utf-8") as f:
@ -30,20 +36,22 @@ if config["spider_settings"]["enable"]:
json.dump(lost_friends, f, ensure_ascii=False, indent=2) json.dump(lost_friends, f, ensure_ascii=False, indent=2)
if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]: if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
print("获取smtp配置信息") logging.info("推送功能已启用,正在准备推送,获取配置信息")
email_settings = config["smtp"] email_settings = config["smtp"]
email = email_settings["email"] email = email_settings["email"]
server = email_settings["server"] server = email_settings["server"]
port = email_settings["port"] port = email_settings["port"]
use_tls = email_settings["use_tls"] use_tls = email_settings["use_tls"]
password = os.getenv("SMTP_PWD") password = os.getenv("SMTP_PWD")
print("密码检测是否存在:", password[:2], "****", password[-2:]) logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
logging.info("密码:{pwd}************".format(pwd=password[:3]))
if config["email_push"]["enable"]: if config["email_push"]["enable"]:
print("邮件推送已启用") logging.info("邮件推送已启用")
logging.info("抱歉,目前暂未实现功能")
if config["rss_subscribe"]["enable"]: if config["rss_subscribe"]["enable"]:
print("RSS通过issue订阅已启用") logging.info("RSS 订阅推送已启用")
# 获取并强制转换为字符串 # 获取并强制转换为字符串
# 尝试从环境变量获取 FCL_REPO # 尝试从环境变量获取 FCL_REPO
fcl_repo = os.getenv('FCL_REPO') fcl_repo = os.getenv('FCL_REPO')
@ -51,13 +59,13 @@ if config["rss_subscribe"]["enable"]:
# 提取 github_username 和 github_repo # 提取 github_username 和 github_repo
if fcl_repo: if fcl_repo:
github_username, github_repo = fcl_repo.split('/') github_username, github_repo = fcl_repo.split('/')
print(f"从环境变量获取到的 GitHub Username: {github_username}")
print(f"从环境变量获取到的 GitHub Repo: {github_repo}")
else: else:
github_username = str(config["rss_subscribe"]["github_username"]).strip() github_username = str(config["rss_subscribe"]["github_username"]).strip()
github_repo = str(config["rss_subscribe"]["github_repo"]).strip() github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
print(f"从配置文件获取到的 GitHub Username: {github_username}")
print(f"从配置文件获取到的 GitHub Repo: {github_repo}") # 输出 github_username 和 github_repo
logging.info("github_username: {github_username}".format(github_username=github_username))
logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
your_blog_url = config["rss_subscribe"]["your_blog_url"] your_blog_url = config["rss_subscribe"]["your_blog_url"]
email_template = config["rss_subscribe"]["email_template"] email_template = config["rss_subscribe"]["email_template"]
@ -69,18 +77,18 @@ if config["rss_subscribe"]["enable"]:
count=5, count=5,
last_articles_path="./rss_subscribe/last_articles.json" last_articles_path="./rss_subscribe/last_articles.json"
) )
print("最新文章为:", latest_articles) logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
if latest_articles == None: if latest_articles == None:
print("没有新文章") logging.info("无未进行推送的新文章")
else: else:
github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200" github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
print("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url)) logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
email_list = extract_emails_from_issues(github_api_url) email_list = extract_emails_from_issues(github_api_url)
if email_list == None: if email_list == None:
print("无邮箱列表") logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确")
sys.exit(0) sys.exit(0)
else: else:
print("获取到的邮箱列表为:", email_list) logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list))
# 循环latest_articles发送邮件 # 循环latest_articles发送邮件
for article in latest_articles: for article in latest_articles:
template_data = { template_data = {