🤗更加合理的请求头
This commit is contained in:
parent
ec4328cbe0
commit
5b47d2d3bc
@ -1,14 +1,36 @@
|
|||||||
import logging
|
import logging
|
||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
import requests
|
import requests
|
||||||
import re
|
|
||||||
import feedparser
|
import feedparser
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
# 标准化的请求头
|
# 标准化的请求头
|
||||||
headers = {
|
HEADERS_JSON = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/123.0.0.0 Safari/537.36 "
|
||||||
|
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
|
||||||
|
),
|
||||||
|
"X-Friend-Circle": "1.0"
|
||||||
|
}
|
||||||
|
|
||||||
|
HEADERS_XML = {
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/123.0.0.0 Safari/537.36 "
|
||||||
|
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
|
||||||
|
),
|
||||||
|
"Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
|
||||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Friend-Circle": "1.0"
|
||||||
}
|
}
|
||||||
|
|
||||||
timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长
|
timeout = (10, 15) # 连接超时和读取超时,防止requests接受时间过长
|
||||||
@ -86,7 +108,7 @@ def check_feed(blog_url, session):
|
|||||||
for feed_type, path in possible_feeds:
|
for feed_type, path in possible_feeds:
|
||||||
feed_url = blog_url.rstrip('/') + path
|
feed_url = blog_url.rstrip('/') + path
|
||||||
try:
|
try:
|
||||||
response = session.get(feed_url, headers=headers, timeout=timeout)
|
response = session.get(feed_url, headers=HEADERS_XML, timeout=timeout)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
return [feed_type, feed_url]
|
return [feed_type, feed_url]
|
||||||
except requests.RequestException:
|
except requests.RequestException:
|
||||||
@ -111,14 +133,14 @@ def parse_feed(url, session, count=5, blog_url=''):
|
|||||||
dict: 包含网站名称、作者、原链接和每篇文章详细内容的字典。
|
dict: 包含网站名称、作者、原链接和每篇文章详细内容的字典。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = session.get(url, headers=headers, timeout=timeout)
|
response = session.get(url, headers=HEADERS_XML, timeout=timeout)
|
||||||
response.encoding = response.apparent_encoding or 'utf-8'
|
response.encoding = response.apparent_encoding or 'utf-8'
|
||||||
feed = feedparser.parse(response.text)
|
feed = feedparser.parse(response.text)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
'website_name': feed.feed.title if 'title' in feed.feed else '',
|
'website_name': feed.feed.title if 'title' in feed.feed else '', # type: ignore
|
||||||
'author': feed.feed.author if 'author' in feed.feed else '',
|
'author': feed.feed.author if 'author' in feed.feed else '', # type: ignore
|
||||||
'link': feed.feed.link if 'link' in feed.feed else '',
|
'link': feed.feed.link if 'link' in feed.feed else '', # type: ignore
|
||||||
'articles': []
|
'articles': []
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,7 +157,7 @@ def parse_feed(url, session, count=5, blog_url=''):
|
|||||||
logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
|
logging.warning(f"文章 {entry.title} 未包含任何时间信息, 请检查原文, 设置为默认时间")
|
||||||
|
|
||||||
# 处理链接中可能存在的错误,比如ip或localhost
|
# 处理链接中可能存在的错误,比如ip或localhost
|
||||||
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else ''
|
article_link = replace_non_domain(entry.link, blog_url) if 'link' in entry else '' # type: ignore
|
||||||
|
|
||||||
article = {
|
article = {
|
||||||
'title': entry.title if 'title' in entry else '',
|
'title': entry.title if 'title' in entry else '',
|
||||||
@ -177,7 +199,19 @@ def replace_non_domain(link: str, blog_url: str) -> str:
|
|||||||
# path = re.sub(r'^https?://[^/]+', '', link)
|
# path = re.sub(r'^https?://[^/]+', '', link)
|
||||||
# print(path)
|
# print(path)
|
||||||
|
|
||||||
return link
|
try:
|
||||||
|
parsed = urlparse(link)
|
||||||
|
if 'localhost' in parsed.netloc or re.match(r'^\d{1,3}(\.\d{1,3}){3}$', parsed.netloc): # IP地址或localhost
|
||||||
|
# 提取 path + query
|
||||||
|
path = parsed.path or '/'
|
||||||
|
if parsed.query:
|
||||||
|
path += '?' + parsed.query
|
||||||
|
return urljoin(blog_url.rstrip('/') + '/', path.lstrip('/'))
|
||||||
|
else:
|
||||||
|
return link # 合法域名则返回原链接
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"替换链接时出错:{link}, error: {e}")
|
||||||
|
return link
|
||||||
|
|
||||||
def process_friend(friend, session, count, specific_RSS=[]):
|
def process_friend(friend, session, count, specific_RSS=[]):
|
||||||
"""
|
"""
|
||||||
@ -250,7 +284,7 @@ def fetch_and_process_data(json_url, specific_RSS=[], count=5):
|
|||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = session.get(json_url, headers=headers, timeout=timeout)
|
response = session.get(json_url, headers=HEADERS_JSON, timeout=timeout)
|
||||||
friends_data = response.json()
|
friends_data = response.json()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"无法获取链接:{json_url} :{e}", exc_info=True)
|
logging.error(f"无法获取链接:{json_url} :{e}", exc_info=True)
|
||||||
@ -338,7 +372,7 @@ def marge_data_from_json_url(data, marge_json_url):
|
|||||||
dict: 合并后的文章信息字典,已去重处理
|
dict: 合并后的文章信息字典,已去重处理
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = requests.get(marge_json_url, headers=headers, timeout=timeout)
|
response = requests.get(marge_json_url, headers=HEADERS_JSON, timeout=timeout)
|
||||||
marge_data = response.json()
|
marge_data = response.json()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)
|
logging.error(f"无法获取链接:{marge_json_url},出现的问题为:{e}", exc_info=True)
|
||||||
|
@ -6,10 +6,17 @@ import json
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# 标准化的请求头
|
# 标准化的请求头
|
||||||
headers = {
|
HEADERS_JSON = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/123.0.0.0 Safari/537.36 "
|
||||||
|
"(Friend-Circle-Lite/1.0; +https://github.com/willow-god/Friend-Circle-Lite)"
|
||||||
|
),
|
||||||
|
"X-Friend-Circle": "1.0"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_emails_from_issues(api_url):
|
def extract_emails_from_issues(api_url):
|
||||||
"""
|
"""
|
||||||
从GitHub issues API中提取以[e-mail]开头的title中的邮箱地址。
|
从GitHub issues API中提取以[e-mail]开头的title中的邮箱地址。
|
||||||
@ -26,7 +33,7 @@ def extract_emails_from_issues(api_url):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = requests.get(api_url, headers=headers)
|
response = requests.get(api_url, headers=HEADERS_JSON, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
issues = response.json()
|
issues = response.json()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
165
run.py
165
run.py
@ -1,95 +1,136 @@
|
|||||||
# 引入 check_feed 和 parse_feed 函数
|
|
||||||
from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time, marge_data_from_json_url, marge_errors_from_json_url, deal_with_large_data
|
|
||||||
from friend_circle_lite.get_conf import load_config
|
|
||||||
from rss_subscribe.push_article_update import get_latest_articles_from_link, extract_emails_from_issues
|
|
||||||
from push_rss_update.send_email import send_emails
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# 日志记录
|
from friend_circle_lite.get_info import (
|
||||||
logging.basicConfig(level=logging.INFO, format='😋 %(levelname)s: %(message)s')
|
fetch_and_process_data,
|
||||||
|
marge_data_from_json_url,
|
||||||
|
marge_errors_from_json_url,
|
||||||
|
deal_with_large_data
|
||||||
|
)
|
||||||
|
from friend_circle_lite.get_conf import load_config
|
||||||
|
from rss_subscribe.push_article_update import (
|
||||||
|
get_latest_articles_from_link,
|
||||||
|
extract_emails_from_issues
|
||||||
|
)
|
||||||
|
from push_rss_update.send_email import send_emails
|
||||||
|
|
||||||
|
# ========== 日志设置 ==========
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='😋 %(levelname)s: %(message)s'
|
||||||
|
)
|
||||||
|
|
||||||
# 爬虫部分内容
|
# ========== 加载配置 ==========
|
||||||
config = load_config("./conf.yaml")
|
config = load_config("./conf.yaml")
|
||||||
|
|
||||||
|
# ========== 爬虫模块 ==========
|
||||||
if config["spider_settings"]["enable"]:
|
if config["spider_settings"]["enable"]:
|
||||||
logging.info("爬虫已启用")
|
logging.info("✅ 爬虫已启用")
|
||||||
|
|
||||||
json_url = config['spider_settings']['json_url']
|
json_url = config['spider_settings']['json_url']
|
||||||
article_count = config['spider_settings']['article_count']
|
article_count = config['spider_settings']['article_count']
|
||||||
specific_RSS = config['specific_RSS']
|
specific_rss = config['specific_RSS']
|
||||||
logging.info("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
|
|
||||||
result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
|
logging.info(f"📥 正在从 {json_url} 获取数据,每个博客获取 {article_count} 篇文章")
|
||||||
|
result, lost_friends = fetch_and_process_data(
|
||||||
|
json_url=json_url,
|
||||||
|
specific_RSS=specific_rss,
|
||||||
|
count=article_count
|
||||||
|
) # type: ignore
|
||||||
|
|
||||||
if config["spider_settings"]["merge_result"]["enable"]:
|
if config["spider_settings"]["merge_result"]["enable"]:
|
||||||
marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
|
merge_url = config['spider_settings']["merge_result"]['merge_json_url']
|
||||||
logging.info("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))
|
logging.info(f"🔀 合并功能开启,从 {merge_url} 获取外部数据")
|
||||||
result = marge_data_from_json_url(result, marge_json_url + "/all.json")
|
|
||||||
lost_friends = marge_errors_from_json_url(lost_friends, marge_json_url + "/errors.json")
|
result = marge_data_from_json_url(result, f"{merge_url}/all.json")
|
||||||
logging.info("数据获取完毕,目前共有 {count} 位好友的动态,正在处理数据".format(count=len(result.get("article_data", []))))
|
lost_friends = marge_errors_from_json_url(lost_friends, f"{merge_url}/errors.json")
|
||||||
|
|
||||||
|
article_count = len(result.get("article_data", []))
|
||||||
|
logging.info(f"📦 数据获取完毕,共有 {article_count} 位好友的动态,正在处理数据")
|
||||||
|
|
||||||
result = deal_with_large_data(result)
|
result = deal_with_large_data(result)
|
||||||
|
|
||||||
with open("all.json", "w", encoding="utf-8") as f:
|
with open("all.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(result, f, ensure_ascii=False, indent=2)
|
json.dump(result, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
with open("errors.json", "w", encoding="utf-8") as f:
|
with open("errors.json", "w", encoding="utf-8") as f:
|
||||||
json.dump(lost_friends, f, ensure_ascii=False, indent=2)
|
json.dump(lost_friends, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
# ========== 推送准备 ==========
|
||||||
if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
|
if config["email_push"]["enable"] or config["rss_subscribe"]["enable"]:
|
||||||
logging.info("推送功能已启用,正在准备推送,获取配置信息")
|
logging.info("📨 推送功能已启用,正在准备中...")
|
||||||
email_settings = config["smtp"]
|
|
||||||
email = email_settings["email"]
|
smtp_conf = config["smtp"]
|
||||||
server = email_settings["server"]
|
sender_email = smtp_conf["email"]
|
||||||
port = email_settings["port"]
|
server = smtp_conf["server"]
|
||||||
use_tls = email_settings["use_tls"]
|
port = smtp_conf["port"]
|
||||||
|
use_tls = smtp_conf["use_tls"]
|
||||||
password = os.getenv("SMTP_PWD")
|
password = os.getenv("SMTP_PWD")
|
||||||
logging.info("SMTP 服务器信息:{server}:{port}".format(server=server, port=port))
|
|
||||||
logging.info("密码:{pwd}************".format(pwd=password[:3]))
|
|
||||||
|
|
||||||
|
logging.info(f"📡 SMTP 服务器:{server}:{port}")
|
||||||
|
if not password:
|
||||||
|
logging.error("❌ 环境变量 SMTP_PWD 未设置,无法发送邮件")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
logging.info(f"🔐 密码(部分):{password[:3]}*****")
|
||||||
|
|
||||||
|
# ========== 邮件推送(待实现)==========
|
||||||
if config["email_push"]["enable"]:
|
if config["email_push"]["enable"]:
|
||||||
logging.info("邮件推送已启用")
|
logging.info("📧 邮件推送已启用")
|
||||||
logging.info("抱歉,目前暂未实现功能")
|
logging.info("⚠️ 抱歉,目前尚未实现邮件推送功能")
|
||||||
|
|
||||||
if config["rss_subscribe"]["enable"]:
|
|
||||||
logging.info("RSS 订阅推送已启用")
|
|
||||||
# 获取并强制转换为字符串
|
|
||||||
# 尝试从环境变量获取 FCL_REPO
|
|
||||||
fcl_repo = os.getenv('FCL_REPO')
|
|
||||||
|
|
||||||
# 提取 github_username 和 github_repo
|
# ========== RSS 订阅推送 ==========
|
||||||
|
if config["rss_subscribe"]["enable"]:
|
||||||
|
logging.info("📰 RSS 订阅推送已启用")
|
||||||
|
|
||||||
|
smtp_conf = config["smtp"]
|
||||||
|
sender_email = smtp_conf["email"]
|
||||||
|
server = smtp_conf["server"]
|
||||||
|
port = smtp_conf["port"]
|
||||||
|
use_tls = smtp_conf["use_tls"]
|
||||||
|
password = os.getenv("SMTP_PWD")
|
||||||
|
|
||||||
|
# 获取 GitHub 仓库信息
|
||||||
|
fcl_repo = os.getenv('FCL_REPO')
|
||||||
if fcl_repo:
|
if fcl_repo:
|
||||||
github_username, github_repo = fcl_repo.split('/')
|
github_username, github_repo = fcl_repo.split('/')
|
||||||
else:
|
else:
|
||||||
github_username = str(config["rss_subscribe"]["github_username"]).strip()
|
github_username = str(config["rss_subscribe"]["github_username"]).strip()
|
||||||
github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
|
github_repo = str(config["rss_subscribe"]["github_repo"]).strip()
|
||||||
|
|
||||||
# 输出 github_username 和 github_repo
|
logging.info(f"👤 GitHub 用户名:{github_username}")
|
||||||
logging.info("github_username: {github_username}".format(github_username=github_username))
|
logging.info(f"📁 GitHub 仓库:{github_repo}")
|
||||||
logging.info("github_repo: {github_repo}".format(github_repo=github_repo))
|
|
||||||
|
|
||||||
your_blog_url = config["rss_subscribe"]["your_blog_url"]
|
your_blog_url = config["rss_subscribe"]["your_blog_url"]
|
||||||
email_template = config["rss_subscribe"]["email_template"]
|
email_template = config["rss_subscribe"]["email_template"]
|
||||||
# 获取网站信息
|
|
||||||
website_title = config["rss_subscribe"]["website_info"]["title"]
|
website_title = config["rss_subscribe"]["website_info"]["title"]
|
||||||
# 获取最近更新的文章
|
|
||||||
latest_articles = get_latest_articles_from_link(
|
latest_articles = get_latest_articles_from_link(
|
||||||
url=your_blog_url,
|
url=your_blog_url,
|
||||||
count=5,
|
count=5,
|
||||||
last_articles_path="./rss_subscribe/last_articles.json"
|
last_articles_path="./rss_subscribe/last_articles.json"
|
||||||
)
|
)
|
||||||
logging.info("获取到的最新文章为:{latest_articles}".format(latest_articles=latest_articles))
|
|
||||||
if latest_articles == None:
|
if not latest_articles:
|
||||||
logging.info("无未进行推送的新文章")
|
logging.info("📭 无新文章,无需推送")
|
||||||
else:
|
else:
|
||||||
github_api_url = "https://api.github.com/repos/" + github_username + "/" + github_repo + "/issues" + "?state=closed&label=subscribed&per_page=200"
|
logging.info(f"🆕 获取到的最新文章:{latest_articles}")
|
||||||
logging.info("正在从 {github_api_url} 中获取订阅信息".format(github_api_url=github_api_url))
|
|
||||||
|
github_api_url = (
|
||||||
|
f"https://api.github.com/repos/{github_username}/{github_repo}/issues"
|
||||||
|
f"?state=closed&label=subscribed&per_page=200"
|
||||||
|
)
|
||||||
|
logging.info(f"🔎 正在从 GitHub 获取订阅邮箱:{github_api_url}")
|
||||||
email_list = extract_emails_from_issues(github_api_url)
|
email_list = extract_emails_from_issues(github_api_url)
|
||||||
if email_list == None:
|
|
||||||
logging.info("无邮箱列表,请检查您的订阅列表是否有订阅者或订阅格式是否正确")
|
if not email_list:
|
||||||
|
logging.info("⚠️ 无订阅邮箱,请检查格式或是否有订阅者")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
else:
|
|
||||||
logging.info("获取到的邮箱列表为:{email_list}".format(email_list=email_list))
|
logging.info(f"📬 获取到邮箱列表:{email_list}")
|
||||||
# 循环latest_articles,发送邮件
|
|
||||||
for article in latest_articles:
|
for article in latest_articles:
|
||||||
template_data = {
|
template_data = {
|
||||||
"title": article["title"],
|
"title": article["title"],
|
||||||
@ -97,17 +138,25 @@ if config["rss_subscribe"]["enable"]:
|
|||||||
"published": article["published"],
|
"published": article["published"],
|
||||||
"link": article["link"],
|
"link": article["link"],
|
||||||
"website_title": website_title,
|
"website_title": website_title,
|
||||||
"github_issue_url": f"https://github.com/{github_username}/{github_repo}/issues?q=is%3Aissue+is%3Aclosed",
|
"github_issue_url": (
|
||||||
|
f"https://github.com/{github_username}/{github_repo}"
|
||||||
|
"/issues?q=is%3Aissue+is%3Aclosed"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
send_emails(
|
send_emails(
|
||||||
emails=email_list["emails"],
|
emails=email_list["emails"],
|
||||||
sender_email=email,
|
sender_email=sender_email,
|
||||||
smtp_server=server,
|
smtp_server=server,
|
||||||
port=port,
|
port=port,
|
||||||
password=password,
|
password=password,
|
||||||
subject= website_title + "の最新文章:" + article["title"],
|
subject=f"{website_title} の最新文章:{article['title']}",
|
||||||
body="文章链接:" + article["link"] + "\n" + "文章内容:" + article["summary"] + "\n" + "发布时间:" + article["published"],
|
body=(
|
||||||
|
f"📄 文章标题:{article['title']}\n"
|
||||||
|
f"🔗 链接:{article['link']}\n"
|
||||||
|
f"📝 简介:{article['summary']}\n"
|
||||||
|
f"🕒 发布时间:{article['published']}"
|
||||||
|
),
|
||||||
template_path=email_template,
|
template_path=email_template,
|
||||||
template_data=template_data,
|
template_data=template_data,
|
||||||
use_tls=use_tls
|
use_tls=use_tls
|
||||||
|
Loading…
x
Reference in New Issue
Block a user