🤗 More sensible request headers
@@ -1,14 +1,36 @@
 import logging
 from datetime import datetime, timedelta, timezone
+import re
+from typing import Any
+from urllib.parse import urljoin, urlparse
 from dateutil import parser
 import requests
-import re
 import feedparser
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # Standardized request headers
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
-}
+HEADERS_JSON = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "X-Friend-Circle": "1.0"
+}
+
+HEADERS_XML = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36 "
+        "(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
+    ),
+    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "X-Friend-Circle": "1.0"
+}
 
 timeout = (10, 15) # connect and read timeouts, to keep requests from waiting too long
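Note: the `(10, 15)` tuple is the standard `requests` convention for separate connect and read timeouts. A minimal sketch of how the two header sets are meant to be used, with a stand-in header dict and a placeholder URL (neither is part of this commit):

import requests

# Stand-in for the HEADERS_XML dict added above; in the real module you
# would use the module-level constant instead of redefining it here.
HEADERS_XML = {
    "User-Agent": "Mozilla/5.0 (Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)",
    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
}

session = requests.Session()
# timeout=(10, 15): give up if the TCP connect takes more than 10 s,
# or if the server stalls for more than 15 s while sending the body.
response = session.get("https://example.com/feed.xml",  # placeholder URL
                       headers=HEADERS_XML, timeout=(10, 15))
print(response.status_code, response.headers.get("Content-Type"))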
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
     for feed_type, path in possible_feeds:
         feed_url = blog_url.rstrip('/') + path
         try:
-            response = session.get(feed_url, headers=headers, timeout=timeout)
+            response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
             if response.status_code == 200:
                 return [feed_type, feed_url]
         except requests.RequestException:
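check_feed probes a list of candidate feed paths and returns the first one that answers 200, now sending the XML-flavoured headers. The `possible_feeds` list sits outside this hunk; a plausible standalone version of the same idea (the candidate paths here are assumptions, not copied from the module):

import requests

def find_feed(blog_url: str, session: requests.Session) -> list:
    # Hypothetical candidates; the real possible_feeds list is not shown in this diff.
    possible_feeds = [('atom', '/atom.xml'), ('rss', '/rss.xml'), ('feed', '/feed')]
    for feed_type, path in possible_feeds:
        feed_url = blog_url.rstrip('/') + path
        try:
            response = session.get(feed_url, timeout=(10, 15))
            if response.status_code == 200:
                return [feed_type, feed_url]
        except requests.RequestException:
            continue
    return ['none', blog_url]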
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
     dict: a dictionary containing the site name, author, original link, and the details of each article.
     """
     try:
-        response = session.get(url, headers=headers, timeout=timeout)
+        response = session.get(url, headers=HEADERS_XML, timeout=timeout)
         response.encoding = response.apparent_encoding or 'utf-8'
         feed = feedparser.parse(response.text)
         
         result = {
-            'website_name': feed.feed.title if 'title' in feed.feed else '',
-            'author': feed.feed.author if 'author' in feed.feed else '',
-            'link': feed.feed.link if 'link' in feed.feed else '',
+            'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
+            'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
+            'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
             'articles': []
         }
         
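Background on the guards above: feedparser's FeedParserDict allows both attribute and dict-style access, and attribute access on a missing key raises AttributeError, which is why each field is wrapped in an `'x' in feed.feed` check (the `# type: ignore` comments only silence the type checker, which cannot see the dynamic attributes). An equivalent, slightly terser spelling uses `.get()` with a default:

import feedparser

feed = feedparser.parse("<rss version='2.0'><channel><title>Demo</title></channel></rss>")
website_name = feed.feed.get('title', '')   # 'Demo'
author = feed.feed.get('author', '')        # '' -- key absent, no exception raised
print(website_name, repr(author))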
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
                 logging.warning(f"Article {entry.title} carries no timestamp at all; check the source post, falling back to the default time")
             
             # Repair links that are clearly wrong, e.g. an IP address or localhost
-            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
+            article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
             
             article = {
                 'title': entry.title if 'title' in entry else '',
@@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
     # path = re.sub(r'^https?://[^/]+', '', link)
     # print(path)
     
-    return link
+    try:
+        parsed = urlparse(link)
+        if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc):  # IP address or localhost
+            # extract path + query
+            path = parsed.path or '/'
+            if parsed.query:
+                path += '?' + parsed.query
+            return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
+        else:
+            return link  # legitimate domain, keep the original link
+    except Exception as e:
+        logging.warning(f"Error while rewriting link {link}: {e}")
+        return link
 
 def process_friend(friend, session, count, specific_RSS=[]):
     """
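The new replace_non_domain rewrites links whose host is localhost or a bare IPv4 address onto the friend's public blog_url and passes everything else through. A quick sanity check of that behaviour (the URLs are made up; note that a netloc carrying a port, such as 127.0.0.1:8080, would not match the IPv4 regex as written, though 'localhost:4000' still matches the substring test):

from urllib.parse import urljoin, urlparse
import re

def replace_non_domain(link: str, blog_url: str) -> str:
    # Same logic as the committed version, minus logging, for a standalone demo.
    parsed = urlparse(link)
    if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc):
        path = parsed.path or '/'
        if parsed.query:
            path += '?' + parsed.query
        return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
    return link

assert replace_non_domain('http://192.168.1.5/post/hello/?p=1', 'https://example.com') \
    == 'https://example.com/post/hello/?p=1'
assert replace_non_domain('http://localhost:4000/about', 'https://example.com') \
    == 'https://example.com/about'
assert replace_non_domain('https://blog.example.org/a', 'https://example.com') \
    == 'https://blog.example.org/a'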
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
     session = requests.Session()
     
     try:
-        response = session.get(json_url, headers=headers, timeout=timeout)
+        response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
         friends_data = response.json()
     except Exception as e:
         logging.error(f"Failed to fetch {json_url}: {e}", exc_info=True)
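fetch_and_process_data pulls the friends list as JSON (hence HEADERS_JSON) and, judging by the ThreadPoolExecutor / as_completed import at the top of the file, fans the per-friend crawling out over a thread pool. The function body is outside this diff; the usual shape of that pattern looks roughly like this (process_friend is the module's function; everything else is an assumption):

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_all(friends, session, count):
    # Sketch only: hand each friend to a worker thread and collect results
    # as they finish, so one slow feed does not serialize the rest.
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(process_friend, f, session, count): f
                   for f in friends}
        for future in as_completed(futures):
            results.append(future.result())
    return results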
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
     dict: the merged article-info dict, already deduplicated
     """
     try:
-        response = requests.get(marge_json_url, headers=headers, timeout=timeout)
+        response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
         marge_data = response.json()
     except Exception as e:
         logging.error(f"Failed to fetch {marge_json_url}; the problem was: {e}", exc_info=True)
 
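The docstring promises deduplication after the merge, but the dedup step itself is not in this hunk. A typical way to do it for this data shape, keying on the article link (the 'link' field name follows the parse_feed result above; the helper itself is an assumption):

def dedup_articles(articles: list) -> list:
    # Keep the first occurrence of each link; dicts preserve insertion order,
    # so the merged ordering survives deduplication.
    seen = {}
    for article in articles:
        link = article.get('link', '')
        if link and link not in seen:
            seen[link] = article
    return list(seen.values())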