🧐添加特定RSS地址配置项,提高爬取成功率

This commit is contained in:
柳神 2024-09-03 21:32:32 +08:00
parent bd8e7f0ab5
commit 9b11eb4a2e
7 changed files with 424 additions and 2204 deletions

2174
all.json

File diff suppressed because it is too large Load Diff

View File

@ -50,3 +50,13 @@ smtp:
server: smtp.qq.com
port: 587
use_tls: true
# 特殊RSS地址指定
# 解释:用于指定特殊RSS地址,如B站专栏等不常见RSS地址后缀,可以添加多个
# name: 友链名称
# url: 指定的RSS地址
specific_RSS:
- name: "Redish101"
url: "https://reblog.redish101.top/api/feed"
# - name: "無名小栈"
# url: "https://blog.imsyy.top/rss.xml"

View File

@ -1,63 +1,193 @@
[
[
"清羽飞扬",
"https://blog.liushen.fun/",
"https://blog.liushen.fun/info/avatar.ico"
],
[
"Akilar",
"https://akilar.top/",
"https://cdn.qyliu.top/i/2024/04/06/661170950f7a2.png"
],
[
"ChrisKim",
"https://www.zouht.com/",
"https://cdn.qyliu.top/i/2024/06/27/667d880789765.webp"
],
[
"张洪Heo",
"https://blog.zhheo.com/",
"https://cdn.qyliu.top/i/2024/08/04/66af2e22827be.webp"
],
[
"安知鱼",
"https://blog.anheyu.com/",
"https://cdn.qyliu.top/i/2024/04/06/66117080f2460.png"
],
[
"Leonus",
"https://blog.leonus.cn/",
"https://cdn.qyliu.top/i/2024/04/11/6617da4084197.png"
],
[
"Tianli",
"https://tianli-blog.club/",
"https://cdn.qyliu.top/i/2024/04/06/66116950412cc.png"
],
[
"星の野",
"https://byer.top/",
"https://cdn.qyliu.top/i/2024/08/04/66af2ed96cd7d.webp"
],
[
"一府",
"https://blog.duolaa.asia/",
"https://cdn.qyliu.top/i/2024/04/19/662145a134a68.png"
],
[
"JackieZhu",
"https://blog.zhfan.top/",
"https://cdn.qyliu.top/i/2024/06/04/665f170c6efde.webp"
],
[
"星の野",
"https://byer.top/",
"https://cdn.qyliu.top/i/2024/08/04/66af2ed96cd7d.webp"
],
[
"星港Star",
"https://blog.starsharbor.com",
"https://cdn.qyliu.top/i/2024/08/04/66af3198042a2.webp"
],
[
"满心记",
"https://blog.lovelu.top/",
"https://cdn.qyliu.top/i/2024/08/04/66af31bec75a0.webp"
],
[
"清风",
"https://luckqf.cn",
"https://cdn.qyliu.top/i/2024/04/07/661240e7c04f2.png"
],
[
"dreamChaser",
"https://blog.wenjing.xin/",
"https://cdn.qyliu.top/i/2024/03/22/65fc59b439430.png"
],
[
"揽星",
"https://lanxing.net/",
"https://cdn.qyliu.top/i/2024/04/07/6612410571110.png"
],
[
"微笔记",
"https://flytusky.top",
"https://cdn.qyliu.top/i/2024/04/06/66117022b57a5.png"
],
[
"百里飞洋",
"https://blog.meta-code.top/",
"https://cdn.qyliu.top/i/2024/08/16/66bef97a8b1ce.webp"
],
[
"June",
"https://blog.june-pj.cn/",
"https://cdn.qyliu.top/i/2024/03/22/65fc5b18a2574.png"
],
[
"阮一峰",
"https://www.ruanyifeng.com/blog/",
"https://cdn.qyliu.top/i/2024/04/29/662fbca75b9fe.png"
],
[
"Android",
"https://android99.com",
"https://cdn.qyliu.top/i/2024/04/07/66124120884cc.png"
"無名小栈",
"https://blog.imsyy.top/",
"https://cdn.qyliu.top/i/2024/03/21/65fc59764c0be.png"
],
[
"贰猹",
"https://noionion.top/",
"https://cdn.qyliu.top/i/2024/04/06/66116d6c6f18b.png"
],
[
"青桔气球",
"https://blog.qjqq.cn/",
"https://cdn.qyliu.top/i/2024/04/07/661241402f02a.png"
],
[
"Android",
"https://android99.com",
"https://cdn.qyliu.top/i/2024/04/07/66124120884cc.png"
],
[
"风记星辰",
"https://www.thyuu.com",
"https://cdn.qyliu.top/i/2024/08/04/66af2d2430f68.webp"
],
[
"Redish101",
"https://blog.redish101.top/",
"https://cdn.qyliu.top/i/2024/04/07/6612417f11b02.png"
],
[
"皮普",
"https://pipuwong.com",
"https://cdn.qyliu.top/i/2024/04/09/6614f1b29c34a.png"
"呓语梦轩",
"https://blog.awaae001.top",
"https://cdn.qyliu.top/i/2024/03/30/660769346d538.jpg"
],
[
"Fgaoxing",
"https://www.yt-blog.top/",
"https://cdn.qyliu.top/i/2024/04/06/66116a9cd62d7.png"
],
[
"山岳库博",
"https://kmar.top/",
"https://cdn.qyliu.top/i/2024/04/06/66116c3ba58c3.png"
],
[
"微霞",
"https://yuuu.org",
"https://cdn.qyliu.top/i/2024/04/06/66116f0fe045d.png"
],
[
"理随",
"https://lisui.top/",
"https://cdn.qyliu.top/i/2024/04/07/66123e6aac11c.png"
],
[
"爱吃肉的猫",
"https://meuicat.com/",
"https://cdn.qyliu.top/i/2024/04/07/66123cff10a83.png"
],
[
"乙未博客",
"https://www.yvii.cn",
"https://cdn.qyliu.top/i/2024/04/09/6614f05e2f75c.png"
],
[
"皮普",
"https://pipuwong.com",
"https://cdn.qyliu.top/i/2024/04/09/6614f1b29c34a.png"
],
[
"GuKaifeng",
"https://gukaifeng.cn/",
"https://cdn.qyliu.top/i/2024/04/09/6614ef03406cc.png"
],
[
"Efu",
"https://blog.everfu.org/",
"https://cdn.qyliu.top/i/2024/04/26/662bcda2afd5d.png"
"东评西就",
"https://dongjunke.cn",
"https://cdn.qyliu.top/i/2024/04/09/6614f26979229.png"
],
[
"obaby",
"https://h4ck.org.cn/",
"https://cdn.qyliu.top/i/2024/04/15/661d443b5359c.png"
"鹊楠",
"https://www.quenan.love",
"https://cdn.qyliu.top/i/2024/04/10/66164fd0e92b6.png"
],
[
"Ariasaka",
"https://blog.yaria.top/",
"https://cdn.qyliu.top/i/2024/04/09/6614f40b65114.png"
],
[
"唐志远",
"https://fe32.top/",
"https://cdn.qyliu.top/i/2024/08/04/66af324b5e627.webp"
],
[
"往日信笺",
@ -65,19 +195,94 @@
"https://cdn.qyliu.top/i/2024/04/14/661bf476a6d8d.png"
],
[
"轻笑",
"https://www.qcqx.cn/",
"https://cdn.qyliu.top/i/2024/05/31/6659628eddf20.png"
"半方池水",
"https://uuanqin.top/",
"https://cdn.qyliu.top/i/2024/04/14/661bf5b9d3d5c.png"
],
[
"Dreamaker",
"https://dreamakerr.cn/",
"https://cdn.qyliu.top/i/2024/06/05/66604a6f8dba9.webp"
"星空故事",
"https://blog.sinzmise.top/",
"https://cdn.qyliu.top/i/2024/04/24/6628f767880fa.png"
],
[
"雾林博客",
"https://www.baiwulin.com/",
"https://cdn.qyliu.top/i/2024/08/02/66ac3b75826cb.webp"
"湘铭",
"https://xiangming.site/",
"https://cdn.qyliu.top/i/2024/04/19/6622928fc416c.png"
],
[
"Fiveth",
"https://blog.fiveth.cc/",
"https://cdn.qyliu.top/i/2024/04/20/6623d64f16aee.png"
],
[
"obaby",
"https://h4ck.org.cn/",
"https://cdn.qyliu.top/i/2024/04/15/661d443b5359c.png"
],
[
"刘郎阁",
"https://yjvc.cn/index.php/",
"https://cdn.qyliu.top/i/2024/04/19/66229406a68df.png"
],
[
"茗辰原",
"https://not.liyy.us.kg/",
"https://cdn.qyliu.top/i/2024/04/21/66249e0a775f8.png"
],
[
"青竹小轩",
"https://gyhwd.top/",
"https://cdn.qyliu.top/i/2024/04/23/662748990165a.png"
],
[
"Efu",
"https://blog.everfu.org/",
"https://cdn.qyliu.top/i/2024/04/26/662bcda2afd5d.png"
],
[
"辞琼",
"https://blog.wsq127.top/",
"https://cdn.qyliu.top/i/2024/04/27/662d1c9b7efe1.png"
],
[
"星辰",
"https://blog.6ing.top/",
"https://cdn.qyliu.top/i/2024/04/27/662d1d8995fe2.png"
],
[
"GanSer",
"https://www.gan1ser.top/",
"https://cdn.qyliu.top/i/2024/04/29/662fbba91addc.png"
],
[
"蛋蛋困了",
"https://blog.wzwzx.cn/",
"https://cdn.qyliu.top/i/2024/04/30/6631024628e0d.png"
],
[
"葱苓sama",
"https://blog.ciraos.top/",
"https://cdn.qyliu.top/i/2024/05/03/663458df4e6f0.png"
],
[
"微生之最",
"https://www.swszz.cn/",
"https://cdn.qyliu.top/i/2024/05/04/66351f5b62b65.png"
],
[
"HiPeach",
"https://blog.opeach.cn",
"https://cdn.qyliu.top/i/2024/05/04/6635d30e5429b.png"
],
[
"陌颜Hao",
"https://blog.imoyan.top/",
"https://cdn.qyliu.top/i/2024/08/04/66af3318f1d1c.webp"
],
[
"XINGYE",
"https://blog.xing-ye.top/",
"https://cdn.qyliu.top/i/2024/05/18/6648b84c682e1.png"
],
[
"M.Talen",
@ -85,13 +290,148 @@
"https://cdn.qyliu.top/i/2024/05/23/664eda97bdec6.png"
],
[
"LinuxWin",
"https://meoblog.pages.dev/",
"https://cdn.qyliu.top/i/2024/08/31/66d32bc0384f8.webp"
"凉心",
"https://www.lxink.cn/",
"https://cdn.qyliu.top/i/2024/05/24/66500d797ac38.png"
],
[
"轻笑",
"https://www.qcqx.cn/",
"https://cdn.qyliu.top/i/2024/05/31/6659628eddf20.png"
],
[
"虹墨",
"https://www.imaegoo.com/",
"https://cdn.qyliu.top/i/2024/05/31/6659668724466.png"
],
[
"痕迹小站",
"https://www.henjinet.com/",
"https://cdn.qyliu.top/i/2024/05/31/6659688bbf14b.png"
],
[
"六月是只猫",
"https://www.lyszm.com/",
"https://cdn.qyliu.top/i/2024/05/31/66597328bdd51.png"
],
[
"Dreamaker",
"https://dreamakerr.cn/",
"https://cdn.qyliu.top/i/2024/06/05/66604a6f8dba9.webp"
],
[
"纸鹿本鹿",
"https://blog.zhilu.cyou",
"https://cdn.qyliu.top/i/2024/06/02/665c869ded7cd.png"
],
[
"ZHI-BLOG",
"https://blog.zhwei101.com/",
"https://cdn.qyliu.top/i/2024/06/01/665a9a8f8418a.png"
],
[
"SerMs",
"https://blog.serms.top/",
"https://cdn.qyliu.top/i/2024/06/23/6678094c8d38d.webp"
],
[
"凌云",
"https://www.linyunlink.top/",
"https://cdn.qyliu.top/i/2024/06/25/667a6fc1f346a.webp"
],
[
"BUZZ",
"https://blog.buzzchat.top/",
"https://cdn.qyliu.top/i/2024/06/27/667d87ae6abf9.webp"
],
[
"未月拾叁",
"https://tsukiyo.cn/",
"https://cdn.qyliu.top/i/2024/07/06/6688f29e7288d.webp"
],
[
"Hitagi",
"https://www.hitagi.icu/",
"https://cdn.qyliu.top/i/2024/07/07/668a9e330dc6a.webp"
],
[
"FloatSheep",
"https://blog.hesiy.cn/posts/",
"https://cdn.qyliu.top/i/2024/07/28/66a6324891ba1.webp"
],
[
"雾林博客",
"https://www.baiwulin.com/",
"https://cdn.qyliu.top/i/2024/08/02/66ac3b75826cb.webp"
],
[
"GZZZ",
"https://blog.gzzz.pro/",
"https://cdn.qyliu.top/i/2024/08/05/66b07b620cd9e.webp"
],
[
"Chlorine",
"https://www.yoghurtlee.com/",
"https://cdn.qyliu.top/i/2024/08/07/66b2daf9ab79f.webp"
],
[
"听风小屋",
"https://blog.ifeng.asia/",
"https://cdn.qyliu.top/i/2024/03/31/6608e2697634c.png"
],
[
"梦爱吃鱼",
"https://blog.bsgun.cn/",
"https://cdn.qyliu.top/i/2024/08/02/66acfd1f993ce.webp"
],
[
"安小歪blog",
"https://hexo.shangskr.top",
"https://cdn.qyliu.top/i/2024/08/18/66c0dbddc587a.webp"
],
[
"小陆yaの博客",
"https://xlwlgzs.com/",
"https://cdn.qyliu.top/i/2024/08/18/66c1b740ad40d.webp"
],
[
"朽丘博",
"https://koxiuqiu.cn",
"https://cdn.qyliu.top/i/2024/08/20/66c41c9c45e2a.webp"
],
[
"星辰日记",
"https://blog.xsot.cn/",
"https://cdn.qyliu.top/i/2024/05/31/66595f03d46be.png"
],
[
"LinuxWin",
"https://meoblog.pages.dev/",
"https://cdn.qyliu.top/i/2024/08/31/66d32bc0384f8.webp"
],
[
"杜老师说",
"https://dusays.com",
"https://cdn.qyliu.top/i/2024/08/04/66af33a4d6643.webp"
],
[
"luo",
"https://www.bokelhc.cn",
"https://cdn.qyliu.top/i/2024/08/28/66ced0341401e.webp"
],
[
"幻雪博客",
"https://huanxueblog.top/",
"https://cdn.qyliu.top/i/2024/03/31/66090bfe06ba6.png"
],
[
"365云栈",
"https://blog.365sites.top/",
"https://cdn.qyliu.top/i/2024/08/02/66acf256da797.webp"
],
[
"AlenLiu",
"https://blog.alenliu.space/",
"https://cdn.qyliu.top/i/2024/03/31/66097a6d9363f.png"
]
]

View File

@ -178,7 +178,7 @@ def parse_feed(url, session, count=5):
'articles': []
}
def process_friend(friend, session, count):
def process_friend(friend, session, count, specific_RSS=[]):
"""
处理单个朋友的博客信息
@ -186,13 +186,24 @@ def process_friend(friend, session, count):
friend (list): 包含朋友信息的列表 [name, blog_url, avatar]
session (requests.Session): 用于请求的会话对象
count (int): 获取每个博客的最大文章数
specific_RSS (list): 包含特定 RSS 源的字典列表 [{name, url}]
返回
dict: 包含朋友博客信息的字典
"""
name, blog_url, avatar = friend
feed_type, feed_url = check_feed(blog_url, session)
print(f"========“{name}”的博客“{blog_url}”的feed类型为“{feed_type}”========")
# 如果 specific_RSS 中有对应的 name则直接返回 feed_url
if specific_RSS is None:
specific_RSS = []
rss_feed = next((rss['url'] for rss in specific_RSS if rss['name'] == name), None)
if rss_feed:
feed_url = rss_feed
feed_type = 'specific'
print(f"========“{name}”的博客“{blog_url}”为特定RSS源“{feed_url}”========")
else:
feed_type, feed_url = check_feed(blog_url, session)
print(f"========“{name}”的博客“{blog_url}”的feed类型为“{feed_type}”========")
if feed_type != 'none':
feed_info = parse_feed(feed_url, session, count)
@ -223,13 +234,14 @@ def process_friend(friend, session, count):
'articles': []
}
def fetch_and_process_data(json_url, count=5):
def fetch_and_process_data(json_url, specific_RSS=[], count=5):
"""
读取 JSON 数据并处理订阅信息返回统计数据和文章信息
参数
json_url (str): 包含朋友信息的 JSON 文件的 URL
count (int): 获取每个博客的最大文章数
specific_RSS (list): 包含特定 RSS 源的字典列表 [{name, url}]
返回
dict: 包含统计数据和文章信息的字典
@ -252,7 +264,7 @@ def fetch_and_process_data(json_url, count=5):
with ThreadPoolExecutor(max_workers=10) as executor:
future_to_friend = {
executor.submit(process_friend, friend, session, count): friend
executor.submit(process_friend, friend, session, count, specific_RSS): friend
for friend in friends_data['friends']
}

View File

@ -10,6 +10,11 @@
## 开发进度
### 2024-09-03
* 添加特定RSS选项,用于指定部分友链的特殊RSS地址
* 更新文档,添加特定RSS选项配置部分
### 2024-08-28
* 日常维护,修复issue中提出的时间为空导致错误的情况,使用更新时间代替
@ -192,6 +197,22 @@
这部分配置较为复杂,请自行学习使用。
- **特定RSS配置**
用于指定特定友链的特殊RSS地址,样例如下:
```yaml
specific_RSS:
- name: "Redish101"
url: "https://reblog.redish101.top/api/feed"
# - name: "無名小栈"
# url: "https://blog.imsyy.top/rss.xml"
```
`name`:友链名称,需要严格匹配
`url`:该友链对应RSS地址
可以添加多个,如果不需要也可以置空。
2. **贡献与定制:**
欢迎对仓库进行贡献或根据需要进行定制。

3
run.py
View File

@ -14,8 +14,9 @@ if config["spider_settings"]["enable"]:
print("爬虫已启用")
json_url = config['spider_settings']['json_url']
article_count = config['spider_settings']['article_count']
specific_RSS = config['specific_RSS']
print("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
result, lost_friends = fetch_and_process_data(json_url=json_url, count=article_count)
result, lost_friends = fetch_and_process_data(json_url=json_url, specific_RSS=specific_RSS, count=article_count)
if config["spider_settings"]["merge_result"]["enable"]:
marge_json_url = config['spider_settings']["merge_result"]['merge_json_url']
print("合并数据功能开启,从 {marge_json_url} 中获取境外数据并合并".format(marge_json_url=marge_json_url + "/all.json"))