🤗 More reasonable request headers

LiuShen 2025-05-03 15:45:52 +08:00
parent ec4328cbe0
commit 5b47d2d3bc
3 changed files with 163 additions and 73 deletions

friend_circle_lite/get_info.py

@@ -1,14 +1,36 @@
import logging
from datetime import datetime, timedelta, timezone
import re
from typing import Any
from urllib.parse import urljoin, urlparse
from dateutil import parser
import requests
import re
import feedparser
from concurrent.futures import ThreadPoolExecutor, as_completed
# Standardized request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
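# Headers for JSON endpoints (the friend-list JSON and the merge API)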
HEADERS_JSON = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36 "
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
),
"X-Friend-Circle": "1.0"
}
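# Headers for RSS/Atom feed requests; the Accept header prefers XML feed types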
HEADERS_XML = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36 "
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
),
"Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"X-Friend-Circle": "1.0"
}
timeout = (10, 15)  # (connect timeout, read timeout), so requests don't wait too long
@@ -86,7 +108,7 @@ def check_feed(blog_url, session):
for feed_type, path in possible_feeds:
feed_url = blog_url.rstrip('/') + path
try:
response = session.get(feed_url, headers=headers, timeout=timeout)
response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
if response.status_code == 200:
return [feed_type, feed_url]
except requests.RequestException:
@@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
dict: a dict containing the site name, author, original link, and the details of each article
"""
try:
response = session.get(url, headers=headers, timeout=timeout)
response = session.get(url, headers=HEADERS_XML, timeout=timeout)
response.encoding = response.apparent_encoding or 'utf-8'
feed = feedparser.parse(response.text)
result = {
'website_name': feed.feed.title if 'title' in feed.feed else '',
'author': feed.feed.author if 'author' in feed.feed else '',
'link': feed.feed.link if 'link' in feed.feed else '',
'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
'articles': []
}
@@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
# Fix links that may be broken, e.g. pointing at an IP address or localhost
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
article = {
'title': entry.title if 'title' in entry else '',
@@ -177,6 +199,18 @@ def replace_non_domain(link: str, blog_url: str) -> str:
# path = re.sub(r'^https?://[^/]+', '', link)
# print(path)
try:
parsed = urlparse(link)
if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc): # IP address or localhost
# extract path + query
path = parsed.path or '/'
if parsed.query:
path += '?' + parsed.query
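# re-root the path on the friend's public blog URL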
return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
else:
return link # valid domain, return the original link
except Exception as e:
logging.warning(f"替换链接时出错:{link}, error: {e}")
return link
def process_friend(friend, session, count, specific_RSS=[]):
@@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
session = requests.Session()
try:
response = session.get(json_url, headers=headers, timeout=timeout)
response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
friends_data = response.json()
except Exception as e:
logging.error(f"无法获取链接:{json_url} {e}", exc_info=True)
@@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
dict: the merged article-info dict, deduplicated
"""
try:
response = requests.get(marge_json_url, headers=headers, timeout=timeout)
response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
marge_data = response.json()
except Exception as e:
logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)

rss_subscribe/push_article_update.py

@@ -6,10 +6,17 @@ import json
import os
# Standardized request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
HEADERS_JSON = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36 "
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
),
"X-Friend-Circle": "1.0"
}
def extract_emails_from_issues(api_url):
"""
Extract email addresses from GitHub issue titles that start with [e-mail]
@@ -26,7 +33,7 @@ def extract_emails_from_issues(api_url):
}
"""
try:
response = requests.get(api_url, headers=headers)
response = requests.get(api_url, headers=HEADERS_JSON, timeout=10)
response.raise_for_status()
issues = response.json()
except Exception as e:

run.py

@@ -1,95 +1,136 @@
# import the check_feed and parse_feed functions
from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time, marge_data_from_json_url, marge_errors_from_json_url, deal_with_large_data
from friend_circle_lite.get_conf import load_config
from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
from push_rss_update.send_email import send_emails
import logging
import json
import sys
import os
# logging setup
logging.basicConfig(level=logging.INFO, format='😋 %(levelname)s: %(message)s')
from friend_circle_lite.get_info import (
fetch_and_process_data,
marge_data_from_json_url,
marge_errors_from_json_url,
deal_with_large_data
)
from friend_circle_lite.get_conf import load_config
from rss_subscribe.push_article_update import (
get_latest_articles_from_link,
extract_emails_from_issues
)
from push_rss_update.send_email import send_emails
# ========== Logging setup ==========
logging.basicConfig(
level=logging.INFO,
format='😋 %(levelname)s: %(message)s'
)
# spider section
# ========== Load configuration ==========
config = load_config("./conf.yaml")
# ========== Spider module ==========
if config["spider_settings"]["enable"]:
logging.info("爬虫已启用")
logging.info("✅ 爬虫已启用")
json_url = config['spider_settings']['json_url']
article_count = config['spider_settings']['article_count']
specific_RSS = config['specific_RSS']
logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
specific_rss = config['specific_RSS']
logging.info(f"📥 正在从 {json_url} 获取数据,每个博客获取 {article_count} 篇文章")
result, lost_friends = fetch_and_process_data(
json_url=json_url,
specific_RSS=specific_rss,
count=article_count
) # type: ignore
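# result holds the aggregated article data; lost_friends holds the friends whose feeds failed (written to errors.json below)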
if config["spider_settings"]["merge_result"]["enable"]:
marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
result = marge_data_from_json_url(result, marge_json_url + "/all.json")
lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
merge_url = config['spider_settings']["merge_result"]['merge_json_url']
logging.info(f"🔀 合并功能开启,从 {merge_url} 获取外部数据")
result = marge_data_from_json_url(result, f"{merge_url}/all.json")
lost_friends = marge_errors_from_json_url(lost_friends, f"{merge_url}/errors.json")
article_count = len(result.get("article_data", []))
logging.info(f"📦 数据获取完毕,共有 {article_count} 位好友的动态,正在处理数据")
result = deal_with_large_data(result)
with open("all.json", "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
with open("errors.json", "w", encoding="utf-8") as f:
json.dump(lost_friends, f, ensure_ascii=False, indent=2)
# ========== Push preparation ==========
if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
logging.info("推送功能已启用,正在准备推送,获取配置信息")
email_settings = config["smtp"]
email = email_settings["email"]
server = email_settings["server"]
port = email_settings["port"]
use_tls = email_settings["use_tls"]
logging.info("📨 推送功能已启用,正在准备中...")
smtp_conf = config["smtp"]
sender_email = smtp_conf["email"]
server = smtp_conf["server"]
port = smtp_conf["port"]
use_tls = smtp_conf["use_tls"]
password = os.getenv("SMTP_PWD")
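# the SMTP password is read from the environment (SMTP_PWD), not from the config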
logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
logging.info("密码:{pwd}************".format(pwd=password[:3]))
logging.info(f"📡 SMTP 服务器:{server}:{port}")
if not password:
logging.error("❌ 环境变量 SMTP_PWD 未设置,无法发送邮件")
sys.exit(1)
else:
logging.info(f"🔐 密码(部分){password[:3]}*****")
# ========== Email push (not yet implemented) ==========
if config["email_push"]["enable"]:
logging.info("邮件推送已启用")
logging.info("抱歉,目前暂未实现功能")
logging.info("📧 邮件推送已启用")
logging.info("⚠️ 抱歉,目前尚未实现邮件推送功能")
# ========== RSS subscription push ==========
if config["rss_subscribe"]["enable"]:
logging.info("RSS 订阅推送已启用")
# fetch and coerce to string
# try to read FCL_REPO from the environment
fcl_repo = os.getenv('FCL_REPO')
logging.info("📰 RSS 订阅推送已启用")
# extract github_username and github_repo
smtp_conf = config["smtp"]
sender_email = smtp_conf["email"]
server = smtp_conf["server"]
port = smtp_conf["port"]
use_tls = smtp_conf["use_tls"]
password = os.getenv("SMTP_PWD")
# get GitHub repository info
fcl_repo = os.getenv('FCL_REPO')
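# FCL_REPO, if set, is expected in the "owner/repo" form; otherwise fall back to the config values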
if fcl_repo:
github_username, github_repo = fcl_repo.split('/')
else:
github_username = str(config["rss_subscribe"]["github_username"]).strip()
github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
# log github_username and github_repo
logging.info("github_username: {github_username}".format(github_username=github_username))
logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
logging.info(f"👤 GitHub 用户名:{github_username}")
logging.info(f"📁 GitHub 仓库:{github_repo}")
your_blog_url = config["rss_subscribe"]["your_blog_url"]
email_template = config["rss_subscribe"]["email_template"]
# get website info
website_title = config["rss_subscribe"]["website_info"]["title"]
# fetch the most recently updated articles
latest_articles = get_latest_articles_from_link(
url=your_blog_url,
count=5,
last_articles_path="./rss_subscribe/last_articles.json"
)
logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
if latest_articles == None:
logging.info("未进行推送的新文章")
if not latest_articles:
logging.info("📭 无新文章,无需推送")
else:
github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
logging.info(f"🆕 获取到的最新文章:{latest_articles}")
github_api_url = (
f"https://api.github.com/repos/{github_username}/{github_repo}/issues"
f"?state=closed&label=subscribed&per_page=200"
)
logging.info(f"🔎 正在从 GitHub 获取订阅邮箱:{github_api_url}")
email_list = extract_emails_from_issues(github_api_url)
if email_list == None:
logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确")
if not email_list:
logging.info("⚠️ 无订阅邮箱,请检查格式或是否有订阅者")
sys.exit(0)
else:
logging.info("获取到邮箱列表{email_list}".format(email_list=email_list))
# loop over latest_articles and send an email for each
logging.info(f"📬 获取到邮箱列表:{email_list}")
for article in latest_articles:
template_data = {
"title": article["title"],
@@ -97,17 +138,25 @@ if config["rss_subscribe"]["enable"]:
"published": article["published"],
"link": article["link"],
"website_title": website_title,
"github_issue_url": f"https://github.com/{github_username}/{github_repo}/issues?q=is%3Aissue+is%3Aclosed",
"github_issue_url": (
f"https://github.com/{github_username}/{github_repo}"
"/issues?q=is%3Aissue+is%3Aclosed"
),
}
send_emails(
emails=email_list["emails"],
sender_email=email,
sender_email=sender_email,
smtp_server=server,
port=port,
password=password,
subject= website_title + "の最新文章:" + article["title"],
body="文章链接:" + article["link"] + "\n" + "文章内容:" + article["summary"] + "\n" + "发布时间:" + article["published"],
subject=f"{website_title} の最新文章:{article['title']}",
body=(
f"📄 文章标题:{article['title']}\n"
f"🔗 链接:{article['link']}\n"
f"📝 简介:{article['summary']}\n"
f"🕒 发布时间:{article['published']}"
),
template_path=email_template,
template_data=template_data,
use_tls=use_tls