🤗 More reasonable request headers

2025-05-03 15:45:52 +08:00
parent ec4328cbe0
commit 5b47d2d3bc
3 changed files with 163 additions and 73 deletions


@@ -1,14 +1,36 @@
import logging
from datetime import datetime, timedelta, timezone
import re
from typing import Any
from urllib.parse import urljoin, urlparse
from dateutil import parser
import requests
import re
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
# Standardized request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}
HEADERS_JSON = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36 "
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
),
"X-Friend-Circle": "1.0"
}
HEADERS_XML = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36 "
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
),
"Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"X-Friend-Circle": "1.0"
}
timeout = (10, 15) # (connect, read) timeouts so requests does not wait too long
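
For context, a minimal usage sketch of the split headers: feed endpoints get HEADERS_XML, plain JSON endpoints get HEADERS_JSON, and every request passes the (connect, read) timeout tuple. It assumes the two dicts defined just above; the example.com URLs are placeholders, not values from this commit.

import requests

TIMEOUT = (10, 15)  # (connect timeout, read timeout) in seconds, same shape as the tuple above
session = requests.Session()
# RSS/Atom endpoints advertise XML in Accept; the friend-list JSON does not need it
feed_resp = session.get("https://example.com/atom.xml", headers=HEADERS_XML, timeout=TIMEOUT)
list_resp = session.get("https://example.com/friends.json", headers=HEADERS_JSON, timeout=TIMEOUT)
print(feed_resp.status_code, list_resp.status_code)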
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
for feed_type, path in possible_feeds:
feed_url = blog_url.rstrip('/') + path
try:
response = session.get(feed_url, headers=headers, timeout=timeout)
response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
if response.status_code == 200:
return [feed_type, feed_url]
except requests.RequestException:
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
dict: A dict containing the site name, author, original link, and the details of each article.
"""
try:
response = session.get(url, headers=headers, timeout=timeout)
response = session.get(url, headers=HEADERS_XML, timeout=timeout)
response.encoding = response.apparent_encoding or 'utf-8'
feed = feedparser.parse(response.text)
result = {
'website_name': feed.feed.title if 'title' in feed.feed else '',
'author': feed.feed.author if 'author' in feed.feed else '',
'link': feed.feed.link if 'link' in feed.feed else '',
'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
'articles': []
}
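
The # type: ignore markers above silence static checkers on feedparser's dynamic attribute access. As an alternative sketch (not what this commit does), FeedParserDict is dict-like, so .get() reads the same fields without the suppressions:

import feedparser

# Parsing from a string works the same way as parsing from a URL.
feed = feedparser.parse("<rss><channel><title>Demo</title></channel></rss>")
result = {
    'website_name': feed.feed.get('title', ''),
    'author': feed.feed.get('author', ''),
    'link': feed.feed.get('link', ''),
    'articles': []
}
print(result['website_name'])  # -> Demo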
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
# Fix links that mistakenly point at an IP address or localhost
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
article = {
'title': entry.title if 'title' in entry else '',
@@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
# path = re.sub(r'^https?://[^/]+', '', link)
# print(path)
return link
try:
parsed = urlparse(link)
if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc): # IP address or localhost
# Extract path + query
path = parsed.path or '/'
if parsed.query:
path += '?' + parsed.query
return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
else:
return link # a proper domain, keep the original link
except Exception as e:
logging.warning(f"替换链接时出错:{link}, error: {e}")
return link
def process_friend(friend, session, count, specific_RSS=[]):
"""
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
session = requests.Session()
try:
response = session.get(json_url, headers=headers, timeout=timeout)
response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
friends_data = response.json()
except Exception as e:
logging.error(f"无法获取链接:{json_url} {e}", exc_info=True)
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
dict: The merged article dict, with duplicates removed.
"""
try:
response = requests.get(marge_json_url, headers=headers, timeout=timeout)
response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
marge_data = response.json()
except Exception as e:
logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)