🤔将获取rss的函数改为多线程(dev)测试中
This commit is contained in:
		@@ -2,15 +2,16 @@ import logging
 | 
			
		||||
from datetime import datetime, timedelta, timezone
 | 
			
		||||
from dateutil import parser
 | 
			
		||||
import requests
 | 
			
		||||
import re
 | 
			
		||||
import feedparser
 | 
			
		||||
from concurrent.futures import ThreadPoolExecutor, as_completed
 | 
			
		||||
 | 
			
		||||
# 设置日志配置
 | 
			
		||||
logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s')
 | 
			
		||||
logging.basicConfig(level=logging.INFO, format='🤪%(levelname)s: %(message)s')
 | 
			
		||||
 | 
			
		||||
# 标准化的请求头
 | 
			
		||||
headers = {
 | 
			
		||||
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
 | 
			
		||||
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长
 | 
			
		||||
@@ -55,16 +56,16 @@ def format_published_time(time_str):
 | 
			
		||||
    return shanghai_time.strftime('%Y-%m-%d %H:%M')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_feed(blog_url, session):
 | 
			
		||||
    """
 | 
			
		||||
    检查博客的 RSS 或 Atom 订阅链接。
 | 
			
		||||
 | 
			
		||||
    此函数接受一个博客地址,尝试在其后拼接 '/atom.xml', '/rss2.xml' 和 '/feed',并检查这些链接是否可访问。
 | 
			
		||||
    Atom 优先,如果都不能访问,则返回 ['none', 源地址]。
 | 
			
		||||
def check_feed(blog_url, session, headers=None, timeout=10):
 | 
			
		||||
    """
 | 
			
		||||
    检查博客的 RSS 或 Atom 订阅链接,使用多线程提高效率,禁止重定向。
 | 
			
		||||
 | 
			
		||||
    参数:
 | 
			
		||||
    blog_url (str): 博客的基础 URL。
 | 
			
		||||
    session (requests.Session): 用于请求的会话对象。
 | 
			
		||||
    headers (dict, 可选): 自定义请求头。
 | 
			
		||||
    timeout (int, 可选): 请求的超时限制,默认为 10 秒。
 | 
			
		||||
 | 
			
		||||
    返回:
 | 
			
		||||
    list: 包含类型和拼接后的链接的列表。如果 atom 链接可访问,则返回 ['atom', atom_url];
 | 
			
		||||
@@ -75,26 +76,40 @@ def check_feed(blog_url, session):
 | 
			
		||||
    
 | 
			
		||||
    possible_feeds = [
 | 
			
		||||
        ('atom', '/atom.xml'),
 | 
			
		||||
        ('rss', '/rss.xml'), # 2024-07-26 添加 /rss.xml内容的支持
 | 
			
		||||
        ('rss', '/rss.xml'),
 | 
			
		||||
        ('rss2', '/rss2.xml'),
 | 
			
		||||
        ('feed', '/feed'),
 | 
			
		||||
        ('feed2', '/feed.xml'), # 2024-07-26 添加 /feed.xml内容的支持
 | 
			
		||||
        ('feed2', '/feed.xml'),
 | 
			
		||||
        ('feed3', '/feed/'),
 | 
			
		||||
        ('index', '/index.xml') # 2024-07-25 添加 /index.xml内容的支持
 | 
			
		||||
        ('index', '/index.xml')
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    for feed_type, path in possible_feeds:
 | 
			
		||||
    def fetch_feed(feed_type, path):
 | 
			
		||||
        feed_url = blog_url.rstrip('/') + path
 | 
			
		||||
        try:
 | 
			
		||||
            response = session.get(feed_url, headers=headers, timeout=timeout)
 | 
			
		||||
            response = session.get(feed_url, headers=headers, timeout=timeout, allow_redirects=False)
 | 
			
		||||
            if response.status_code == 200:
 | 
			
		||||
                return [feed_type, feed_url]
 | 
			
		||||
            elif response.status_code in [301, 302, 303]:
 | 
			
		||||
                return None  # 重定向,不处理
 | 
			
		||||
        except requests.RequestException:
 | 
			
		||||
            continue
 | 
			
		||||
            return None  # 请求异常,不处理
 | 
			
		||||
 | 
			
		||||
    # 使用 ThreadPoolExecutor 执行多个线程
 | 
			
		||||
    with ThreadPoolExecutor() as executor:
 | 
			
		||||
        futures = [executor.submit(fetch_feed, feed_type, path) for feed_type, path in possible_feeds]
 | 
			
		||||
 | 
			
		||||
        # 等待线程完成并获取结果
 | 
			
		||||
        for future in as_completed(futures):
 | 
			
		||||
            result = future.result()
 | 
			
		||||
            if result:
 | 
			
		||||
                return result  # 如果找到有效的订阅链接,返回
 | 
			
		||||
 | 
			
		||||
    logging.warning(f"无法找到 {blog_url} 的订阅链接")
 | 
			
		||||
    return ['none', blog_url]
 | 
			
		||||
 | 
			
		||||
def parse_feed(url, session, count=5):
 | 
			
		||||
 | 
			
		||||
def parse_feed(url, session, count=5, blog_url=''):
 | 
			
		||||
    """
 | 
			
		||||
    解析 Atom 或 RSS2 feed 并返回包含网站名称、作者、原链接和每篇文章详细内容的字典。
 | 
			
		||||
 | 
			
		||||
@@ -121,7 +136,7 @@ def parse_feed(url, session, count=5):
 | 
			
		||||
            'articles': []
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
        for i, entry in enumerate(feed.entries):
 | 
			
		||||
        for _ , entry in enumerate(feed.entries):
 | 
			
		||||
            
 | 
			
		||||
            if 'published' in entry:
 | 
			
		||||
                published = format_published_time(entry.published)
 | 
			
		||||
@@ -131,11 +146,15 @@ def parse_feed(url, session, count=5):
 | 
			
		||||
                logging.warning(f"文章 {entry.title} 未包含发布时间,已使用更新时间 {published}")
 | 
			
		||||
            else:
 | 
			
		||||
                published = ''
 | 
			
		||||
                logging.warning(f"文章 {entry.title} 未包含任何时间信息")
 | 
			
		||||
                logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
 | 
			
		||||
            
 | 
			
		||||
            # 处理链接中可能存在的错误,比如ip或localhost
 | 
			
		||||
            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
 | 
			
		||||
            
 | 
			
		||||
            article = {
 | 
			
		||||
                'title': entry.title if 'title' in entry else '',
 | 
			
		||||
                'author': result['author'],
 | 
			
		||||
                'link': entry.link if 'link' in entry else '',
 | 
			
		||||
                'link': article_link,
 | 
			
		||||
                'published': published,
 | 
			
		||||
                'summary': entry.summary if 'summary' in entry else '',
 | 
			
		||||
                'content': entry.content[0].value if 'content' in entry and entry.content else entry.description if 'description' in entry else ''
 | 
			
		||||
@@ -149,7 +168,7 @@ def parse_feed(url, session, count=5):
 | 
			
		||||
        
 | 
			
		||||
        return result
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logging.error(f"无法解析FEED地址:{url} ,请自行排查原因!", exc_info=True)
 | 
			
		||||
        logging.error(f"无法解析FEED地址:{url} ,请自行排查原因!")
 | 
			
		||||
        return {
 | 
			
		||||
            'website_name': '',
 | 
			
		||||
            'author': '',
 | 
			
		||||
@@ -157,6 +176,23 @@ def parse_feed(url, session, count=5):
 | 
			
		||||
            'articles': []
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
def replace_non_domain(link: str, blog_url: str) -> str:
    """
    Replace a non-domain host (IP address or localhost) in ``link`` with ``blog_url``.

    Feeds generated on a development machine sometimes contain article links
    such as ``http://localhost:4000/post/`` or ``http://127.0.0.1/post/``.
    When the host of ``link`` is ``localhost`` or a literal IPv4/IPv6 address,
    the scheme, host and port are replaced by ``blog_url``; the result always
    uses ``https`` and a trailing slash on ``blog_url`` never produces a
    double slash. Path, query string and fragment of ``link`` are kept as-is.

    The link is returned unchanged when it is empty, when ``blog_url`` is
    empty, when it is not an absolute http(s) URL, or when its host is a
    regular domain name.

    :param link: original article address
    :param blog_url: blog address that replaces the non-domain host
    :return: the corrected address, or ``link`` itself if nothing had to change
    """
    import ipaddress  # local import: not part of the module-level imports

    if not link or not blog_url:
        return link

    # Only absolute http(s) URLs carry a host that can be inspected.
    matched = re.match(r'^https?://([^/?#]*)', link, flags=re.IGNORECASE)
    if matched is None:
        return link

    # Strip optional "user:pass@" and ":port" to isolate the bare host.
    authority = matched.group(1).rsplit('@', 1)[-1]
    if authority.startswith('['):
        # Bracketed IPv6 literal, e.g. "[::1]:4000".
        host = authority[1:].split(']', 1)[0]
    else:
        host = authority.split(':', 1)[0]

    if host.lower() != 'localhost':
        try:
            ipaddress.ip_address(host)
        except ValueError:
            # A regular domain name: nothing to fix.
            return link

    # Drop any scheme and trailing slash from blog_url, then force https.
    base = re.sub(
        r'^[a-z][a-z0-9+.-]*://', '', blog_url.strip(), flags=re.IGNORECASE
    ).rstrip('/')
    if not base:
        return link

    return 'https://' + base + link[matched.end():]
 | 
			
		||||
 | 
			
		||||
def process_friend(friend, session, count, specific_RSS=[]):
 | 
			
		||||
    """
 | 
			
		||||
    处理单个朋友的博客信息。
 | 
			
		||||
@@ -179,13 +215,13 @@ def process_friend(friend, session, count, specific_RSS=[]):
 | 
			
		||||
    if rss_feed:
 | 
			
		||||
        feed_url = rss_feed
 | 
			
		||||
        feed_type = 'specific'
 | 
			
		||||
        logging.info(f"“{name}”的博客“{blog_url}”为特定RSS源“{feed_url}”")
 | 
			
		||||
        logging.info(f"“{name}”的博客“ {blog_url} ”为特定RSS源“ {feed_url} ”")
 | 
			
		||||
    else:
 | 
			
		||||
        feed_type, feed_url = check_feed(blog_url, session)
 | 
			
		||||
        logging.info(f"“{name}”的博客“{blog_url}”的feed类型为“{feed_type}”")
 | 
			
		||||
        logging.info(f"“{name}”的博客“ {blog_url} ”的feed类型为“{feed_type}”, feed地址为“ {feed_url} ”")
 | 
			
		||||
 | 
			
		||||
    if feed_type != 'none':
 | 
			
		||||
        feed_info = parse_feed(feed_url, session, count)
 | 
			
		||||
        feed_info = parse_feed(feed_url, session, count, blog_url)
 | 
			
		||||
        articles = [
 | 
			
		||||
            {
 | 
			
		||||
                'title': article['title'],
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user