🤔 Switch the RSS-fetching function to multithreading (dev), under testing

This commit is contained in:
柳神 2024-11-23 00:43:43 +08:00
parent 55d88561b2
commit aa14d1b706
4 changed files with 95 additions and 43 deletions

friend_circle_lite/get_info.py

@@ -2,15 +2,16 @@ import logging
from datetime import datetime, timedelta, timezone
from dateutil import parser
import requests
import re
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
# Logging configuration
logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s')
logging.basicConfig(level=logging.INFO, format='🤪%(levelname)s: %(message)s')
# Standard request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}
timeout = (10, 15) # connect timeout and read timeout, to keep requests from waiting too long
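The (10, 15) tuple follows requests' (connect, read) timeout convention; a minimal sketch of how such a tuple behaves (the URL is a placeholder, not one from this project):

```python
import requests

timeout = (10, 15)  # 10 s to establish the connection, 15 s per read
try:
    resp = requests.get("https://example.com/atom.xml", timeout=timeout)
except requests.Timeout:
    print("connect or read phase exceeded its limit")
```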
@@ -55,16 +56,16 @@ def format_published_time(time_str):
return shanghai_time.strftime('%Y-%m-%d %H:%M')
def check_feed(blog_url, session):
"""
Check the blog's RSS or Atom feed link.
This function takes a blog URL, tries appending '/atom.xml', '/rss2.xml', and '/feed', and checks whether those links are reachable.
Atom takes priority; if none are reachable, ['none', source URL] is returned.
def check_feed(blog_url, session, headers=None, timeout=10):
"""
Check the blog's RSS or Atom feed link, using multiple threads for efficiency; redirects are not followed.
Args:
blog_url (str): the base URL of the blog
session (requests.Session): the session object used for requests
headers (dict, optional): custom request headers
timeout (int, optional): request timeout limit; defaults to 10 seconds
Returns:
list: a list containing the feed type and the joined link; if the atom link is reachable, returns ['atom', atom_url]
@@ -75,26 +76,40 @@ def check_feed(blog_url, session):
possible_feeds = [
('atom', '/atom.xml'),
('rss', '/rss.xml'), # 2024-07-26: added support for /rss.xml
('rss', '/rss.xml'),
('rss2', '/rss2.xml'),
('feed', '/feed'),
('feed2', '/feed.xml'), # 2024-07-26: added support for /feed.xml
('feed2', '/feed.xml'),
('feed3', '/feed/'),
('index', '/index.xml') # 2024-07-25: added support for /index.xml
('index', '/index.xml')
]
for feed_type, path in possible_feeds:
def fetch_feed(feed_type, path):
feed_url = blog_url.rstrip('/') + path
try:
response = session.get(feed_url, headers=headers, timeout=timeout)
response = session.get(feed_url, headers=headers, timeout=timeout, allow_redirects=False)
if response.status_code == 200:
return [feed_type, feed_url]
elif response.status_code in [301, 302, 303]:
return None # redirect; do not handle
except requests.RequestException:
continue
return None # request exception; do not handle
# Run the checks in multiple threads with a ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
futures = [executor.submit(fetch_feed, feed_type, path) for feed_type, path in possible_feeds]
# Wait for the threads to finish and collect their results
for future in as_completed(futures):
result = future.result()
if result:
return result # return as soon as a valid feed link is found
logging.warning(f"Could not find a feed link for {blog_url}")
return ['none', blog_url]
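One caveat with this change: as_completed yields futures in completion order, so whichever candidate feed answers first wins, and the "Atom first" priority from the old docstring is no longer guaranteed. A sketch (not the author's code) that keeps the concurrency but restores list-order priority via executor.map:

```python
import requests
from concurrent.futures import ThreadPoolExecutor

def check_feed_ordered(blog_url, session, possible_feeds, headers=None, timeout=10):
    """Probe all candidate paths concurrently, then honor the list order."""
    def probe(item):
        feed_type, path = item
        feed_url = blog_url.rstrip('/') + path
        try:
            resp = session.get(feed_url, headers=headers,
                               timeout=timeout, allow_redirects=False)
            if resp.status_code == 200:
                return [feed_type, feed_url]
        except requests.RequestException:
            pass
        return None

    with ThreadPoolExecutor() as executor:
        # executor.map returns results in input order, not completion order
        for result in executor.map(probe, possible_feeds):
            if result:
                return result
    return ['none', blog_url]
```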
def parse_feed(url, session, count=5):
def parse_feed(url, session, count=5, blog_url=''):
"""
Parse an Atom or RSS2 feed and return a dict containing the site name, author, site link, and the detailed content of each article.
@@ -121,7 +136,7 @@ def parse_feed(url, session, count=5):
'articles': []
}
for i, entry in enumerate(feed.entries):
for _, entry in enumerate(feed.entries):
if 'published' in entry:
published = format_published_time(entry.published)
@@ -131,11 +146,15 @@ def parse_feed(url, session, count=5):
logging.warning(f"文章 {entry.title} 未包含发布时间,已使用更新时间 {published}")
else:
published = ''
logging.warning(f"文章 {entry.title} 未包含任何时间信息")
logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
# 处理链接中可能存在的错误比如ip或localhost
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
article = {
'title': entry.title if 'title' in entry else '',
'author': result['author'],
'link': entry.link if 'link' in entry else '',
'link': article_link,
'published': published,
'summary': entry.summary if 'summary' in entry else '',
'content': entry.content[0].value if 'content' in entry and entry.content else entry.description if 'description' in entry else ''
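The content fallback chain above mirrors how feedparser populates entries: Atom feeds usually fill entry.content, while RSS 2.0 feeds fill entry.summary / entry.description. A standalone illustration (the feed URL is a placeholder):

```python
import feedparser

feed = feedparser.parse("https://example.com/atom.xml")  # placeholder URL
for entry in feed.entries[:1]:
    body = (entry.content[0].value
            if 'content' in entry and entry.content
            else entry.get('description', ''))
    print(entry.get('title', ''), len(body))
```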
@@ -149,7 +168,7 @@ def parse_feed(url, session, count=5):
return result
except Exception as e:
logging.error(f"无法解析FEED地址{url} ,请自行排查原因!", exc_info=True)
logging.error(f"无法解析FEED地址{url} ,请自行排查原因!")
return {
'website_name': '',
'author': '',
@@ -157,6 +176,23 @@ def parse_feed(url, session, count=5):
'articles': []
}
def replace_non_domain(link: str, blog_url: str) -> str:
"""
Not yet implemented.
Detect abnormal host parts in the string (e.g. an IP address or localhost) and replace them with blog_url.
After replacement, force https, and account for whether blog_url has a trailing slash.
:param link: the original URL string
:param blog_url: the blog URL to substitute in
:return: the URL string after replacement
"""
# Extract the path portion of link (scheme and host not needed)
# path = re.sub(r'^https?://[^/]+', '', link)
# print(path)
return link
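The docstring describes the stub's intended behavior; one possible implementation, sketched with urllib.parse (the helper name and logic are assumptions, not the author's code):

```python
import re
from urllib.parse import urlparse

def replace_non_domain_sketch(link: str, blog_url: str) -> str:
    """If link's host is an IP address or localhost, swap in blog_url's host over https."""
    parsed = urlparse(link)
    host = parsed.hostname or ''
    is_ip = re.fullmatch(r'\d{1,3}(\.\d{1,3}){3}', host) is not None
    if host == 'localhost' or is_ip:
        base = 'https://' + urlparse(blog_url).netloc
        query = ('?' + parsed.query) if parsed.query else ''
        return base + (parsed.path or '/') + query
    return link
```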
def process_friend(friend, session, count, specific_RSS=[]):
"""
Process a single friend's blog info.
@@ -182,10 +218,10 @@ def process_friend(friend, session, count, specific_RSS=[]):
logging.info(f"{name}”的博客“ {blog_url} ”为特定RSS源“ {feed_url}")
else:
feed_type, feed_url = check_feed(blog_url, session)
logging.info(f"{name}”的博客“{blog_url}”的feed类型为“{feed_type}")
logging.info(f"{name}”的博客“ {blog_url} ”的feed类型为“{feed_type}”, feed地址为“ {feed_url} ")
if feed_type != 'none':
feed_info = parse_feed(feed_url, session, count)
feed_info = parse_feed(feed_url, session, count, blog_url)
articles = [
{
'title': article['title'],

push_rss_update/send_email.py

@@ -1,9 +1,13 @@
import logging
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from jinja2 import Environment, FileSystemLoader
import os
logging.basicConfig(level=logging.INFO, format='😬%(levelname)s: %(message)s')
def email_sender(
target_email,
sender_email,
@@ -56,7 +60,7 @@ def email_sender(
server.sendmail(sender_email, target_email, msg.as_string())
print(f'Email sent to {target_email}')
except Exception as e:
print(f'Failed to send email to {target_email}. Error: {e}')
logging.error(f'Email delivery failed; recipient: {target_email}, error: {e}')
def send_emails(emails, sender_email, smtp_server, port, password, subject, body, template_path=None, template_data=None, use_tls=True):
"""
@@ -75,6 +79,5 @@ def send_emails(emails, sender_email, smtp_server, port, password, subject, body
use_tls (bool): whether to use TLS encryption; defaults to True
"""
for email in emails:
print(f'Sending email to {email}')
print(f'---------------------------\nSubject: {subject}\nBody: {body}\nFrom: {sender_email}\n---------------------------')
logging.info(f'Sending email to {email}; subject: {subject}')
email_sender(email, sender_email, smtp_server, port, password, subject, body, template_path, template_data, use_tls)
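For reference, a hypothetical call matching the send_emails signature above (every value is a placeholder):

```python
send_emails(
    emails=["alice@example.com"],       # placeholder recipient
    sender_email="me@example.com",      # placeholder sender
    smtp_server="smtp.example.com",
    port=465,
    password="app-password",            # placeholder secret
    subject="Friend feed update",
    body="A friend published a new article.",
    use_tls=True,
)
```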

rss_subscribe/push_article_update.py

@@ -1,9 +1,14 @@
import logging
import requests
import re
from friend_circle_lite.get_info import check_feed, parse_feed
import json
import os
# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# Standard request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
@@ -29,7 +34,7 @@ def extract_emails_from_issues(api_url):
response.raise_for_status()
issues = response.json()
except Exception as e:
print(f"无法获取该链接:{api_url}\n出现的问题为:{e}")
logging.error(f"无法获取 GitHub issues 数据,错误信息: {e}")
return None
email_pattern = re.compile(r'^\[邮箱订阅\](.+)$')
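The pattern only matches issue titles of the exact form "[邮箱订阅]<address>"; a quick illustration with a sample title:

```python
import re

email_pattern = re.compile(r'^\[邮箱订阅\](.+)$')
match = email_pattern.match('[邮箱订阅]user@example.com')  # sample title
print(match.group(1) if match else None)  # -> user@example.com
```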
@@ -62,7 +67,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr
session = requests.Session()
feed_type, feed_url = check_feed(url, session)
if feed_type == 'none':
print(f"无法访问 {url} 的 feed")
logging.error(f"无法获取 {url} 的文章数据")
return None
# Fetch the latest article data
@@ -86,7 +91,7 @@ def get_latest_articles_from_link(url, count=5, last_articles_path="./rss_subscr
if article['link'] not in last_titles:
updated_articles.append(article)
print(f"{url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章")
logging.info(f"{url} 获取到 {len(latest_articles)} 篇文章,其中 {len(updated_articles)} 篇为新文章")
# 更新本地存储的文章数据
with open(local_file, 'w', encoding='utf-8') as file:

run.py

@@ -4,24 +4,30 @@ from friend_circle_lite.get_conf import load_config
from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
from push_rss_update.send_email import send_emails
import logging
import json
import sys
import os
# Logging setup
logging.basicConfig(level=logging.INFO, format='😋%(levelname)s: %(message)s')
# Spider section
config = load_config("./conf.yaml")
if config["spider_settings"]["enable"]:
print("爬虫已启用")
logging.info("爬虫已启用")
json_url = config['spider_settings']['json_url']
article_count = config['spider_settings']['article_count']
specific_RSS = config['specific_RSS']
print("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
if config["spider_settings"]["merge_result"]["enable"]:
marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
print("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
result = marge_data_from_json_url(result, marge_json_url + "/all.json")
lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
result = deal_with_large_data(result)
with open("all.json", "w", encoding="utf-8") as f:
@@ -30,20 +36,22 @@ if config["spider_settings"]["enable"]:
json.dump(lost_friends, f, ensure_ascii=False, indent=2)
if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
print("获取smtp配置信息")
logging.info("推送功能已启用,正在准备推送,获取配置信息")
email_settings = config["smtp"]
email = email_settings["email"]
server = email_settings["server"]
port = email_settings["port"]
use_tls = email_settings["use_tls"]
password = os.getenv("SMTP_PWD")
print("密码检测是否存在:", password[:2], "****", password[-2:])
logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
logging.info("密码:{pwd}************".format(pwd=password[:3]))
if config["email_push"]["enable"]:
print("邮件推送已启用")
logging.info("邮件推送已启用")
logging.info("抱歉,目前暂未实现功能")
if config["rss_subscribe"]["enable"]:
print("RSS通过issue订阅已启用")
logging.info("RSS 订阅推送已启用")
# 获取并强制转换为字符串
# 尝试从环境变量获取 FCL_REPO
fcl_repo = os.getenv('FCL_REPO')
@@ -51,13 +59,13 @@ if config["rss_subscribe"]["enable"]:
# Extract github_username and github_repo
if fcl_repo:
github_username, github_repo = fcl_repo.split('/')
print(f"从环境变量获取到的 GitHub Username: {github_username}")
print(f"从环境变量获取到的 GitHub Repo: {github_repo}")
else:
github_username = str(config["rss_subscribe"]["github_username"]).strip()
github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
print(f"从配置文件获取到的 GitHub Username: {github_username}")
print(f"从配置文件获取到的 GitHub Repo: {github_repo}")
# Log github_username and github_repo
logging.info("github_username: {github_username}".format(github_username=github_username))
logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
your_blog_url = config["rss_subscribe"]["your_blog_url"]
email_template = config["rss_subscribe"]["email_template"]
@@ -69,18 +77,18 @@ if config["rss_subscribe"]["enable"]:
count=5,
last_articles_path="./rss_subscribe/last_articles.json"
)
print("最新文章为:", latest_articles)
logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
if latest_articles == None:
print("没有新文章")
logging.info("无未进行推送的新文章")
else:
github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
print("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
email_list = extract_emails_from_issues(github_api_url)
if email_list is None:
print("No email list")
logging.info("No email list; check whether your subscription list has any subscribers and whether the subscription format is correct")
sys.exit(0)
else:
print("获取到的邮箱列表为:", email_list)
logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list))
# 循环latest_articles发送邮件
for article in latest_articles:
template_data = {