😥测试action是否正常运行
This commit is contained in:
		
							
								
								
									
										40
									
								
								.github/workflows/friend_circle_lite.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								.github/workflows/friend_circle_lite.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,40 @@
 | 
			
		||||
# Periodically runs the feed crawler (run.py) and pushes whatever it generated
# back to the repository.
name: RSS Check

on:
  schedule:
    # 22:00 UTC on every second day of the month.
    - cron: "0 22 */2 * *"
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

jobs:
  check_rss:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Check RSS feeds
        run: python run.py

      - name: git config
        run: |
          git config --global user.name 'GitHub Actions'
          git config --global user.email 'actions@github.com'

      - name: Commit changes
        env:
          # Expose the secret through the environment instead of expanding it
          # directly inside the script text.
          PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
        run: |
          git add .
          # `git commit` exits non-zero when nothing is staged, which would
          # mark the whole run as failed; stop early in that case.
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "⏱️GitHub Action定时更新"
          git push "https://x-access-token:${PAT_TOKEN}@github.com/${{ github.repository }}.git" main
 | 
			
		||||
							
								
								
									
										6
									
								
								conf.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								conf.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
			
		||||
# Spider (crawler) settings
 | 
			
		||||
spider_settings:
 | 
			
		||||
  enable: true                                    # Whether the spider is enabled
 | 
			
		||||
  json_url: "https://blog.qyliu.top/friend.json"  # URL of the friends JSON in the expected format; only remote (network) URLs are supported
 | 
			
		||||
  article_count: 5                                # Maximum number of articles to fetch per blog
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										227
									
								
								dev_test/main.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										227
									
								
								dev_test/main.ipynb
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,227 @@
 | 
			
		||||
{
 | 
			
		||||
 "cells": [
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 1,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "正在检查 清羽飞扬 的博客 https://blog.qyliu.top/\n",
 | 
			
		||||
      "正在检查 张洪Heo 的博客 https://blog.zhheo.com/\n",
 | 
			
		||||
      "正在检查 Leonus 的博客 https://blog.leonus.cn/\n",
 | 
			
		||||
      "正在检查 杜老师说 的博客 https://dusays.com\n",
 | 
			
		||||
      "一府 的博客 https://blog.duolaa.asia/ 无法访问\n",
 | 
			
		||||
      "正在检查 贰猹 的博客 https://noionion.top/\n",
 | 
			
		||||
      "正在检查 ChrisKim 的博客 https://www.zouht.com/\n",
 | 
			
		||||
      "正在检查 無名小栈 的博客 https://blog.imsyy.top/\n",
 | 
			
		||||
      "正在检查 满心记 的博客 https://blog.lovelu.top/\n",
 | 
			
		||||
      "正在检查 Tianli 的博客 https://tianli-blog.club/\n",
 | 
			
		||||
      "不可链接的FEED地址:https://blog.imsyy.top/atom.xml: HTTPSConnectionPool(host='blog.imsyy.top', port=443): Max retries exceeded with url: /404 (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))\n",
 | 
			
		||||
      "正在检查 Akilar 的博客 https://akilar.top/\n",
 | 
			
		||||
      "正在检查 星港Star 的博客 https://blog.starsharbor.com\n",
 | 
			
		||||
      "清风 的博客 https://luckqf.cn 无法访问\n",
 | 
			
		||||
      "dreamChaser 的博客 https://blog.wenjing.xin/ 无法访问\n",
 | 
			
		||||
      "正在检查 百里飞洋 的博客 https://blog.meta-code.top/\n",
 | 
			
		||||
      "正在检查 揽星 的博客 https://lanxing.net/\n",
 | 
			
		||||
      "正在检查 Android 的博客 https://android99.com\n",
 | 
			
		||||
      "正在检查 阮一峰 的博客 https://www.ruanyifeng.com/blog/\n",
 | 
			
		||||
      "正在检查 星辰日记 的博客 https://blog.xsot.cn/\n",
 | 
			
		||||
      "正在检查 星の野 的博客 https://byer.top/\n",
 | 
			
		||||
      "正在检查 June 的博客 https://blog.june-pj.cn/\n",
 | 
			
		||||
      "正在检查 幻雪博客 的博客 https://huanxueblog.top/\n",
 | 
			
		||||
      "正在检查 风记星辰 的博客 https://www.thyuu.com\n",
 | 
			
		||||
      "正在检查 呓语梦轩 的博客 https://blog.awaae001.top\n",
 | 
			
		||||
      "正在检查 青桔气球 的博客 https://blog.qjqq.cn/\n",
 | 
			
		||||
      "AlenLiu 的博客 https://blog.alenliu.space/ 无法访问\n",
 | 
			
		||||
      "正在检查 山岳库博 的博客 https://kmar.top/\n",
 | 
			
		||||
      "JackieZhu 的博客 https://blog.zhfan.top/ 无法访问\n",
 | 
			
		||||
      "正在检查 微霞 的博客 https://yuuu.org\n",
 | 
			
		||||
      "Redish 的博客 https://blog.redish101.top/ 无法访问\n",
 | 
			
		||||
      "Rootlex 的博客 https://blog.nalex.top 无法访问\n",
 | 
			
		||||
      "GuKaifeng 的博客 https://gukaifeng.cn/ 无法访问\n",
 | 
			
		||||
      "正在检查 乙未博客 的博客 https://www.yvii.cn\n",
 | 
			
		||||
      "正在检查 东评西就 的博客 https://dongjunke.cn\n",
 | 
			
		||||
      "正在检查 Fgaoxing 的博客 https://www.yt-blog.top/\n",
 | 
			
		||||
      "正在检查 鹊楠 的博客 https://www.quenan.love\n",
 | 
			
		||||
      "正在检查 Ariasaka 的博客 https://blog.yaria.top/\n",
 | 
			
		||||
      "理随 的博客 https://lisui.top/ 无法访问\n",
 | 
			
		||||
      "听风小屋 的博客 https://blog.ifeng.asia/ 无法访问\n",
 | 
			
		||||
      "正在检查 半方池水 的博客 https://uuanqin.top/\n",
 | 
			
		||||
      "正在检查 往日信笺 的博客 https://www.xingmail.cn/\n",
 | 
			
		||||
      "正在检查 皮普 的博客 https://pipuwong.com\n",
 | 
			
		||||
      "正在检查 luo 的博客 https://www.bokelhc.cn\n",
 | 
			
		||||
      "正在检查 星空故事 的博客 https://blog.sinzmise.top/\n",
 | 
			
		||||
      "正在检查 刘郎阁 的博客 https://yjvc.cn/index.php/\n",
 | 
			
		||||
      "正在检查 obaby 的博客 https://h4ck.org.cn/\n",
 | 
			
		||||
      "正在检查 湘铭 的博客 https://xiangming.site/\n",
 | 
			
		||||
      "正在检查 爱吃肉的猫 的博客 https://meuicat.com/\n",
 | 
			
		||||
      "青竹小轩 的博客 https://gyhwd.top/ 无法访问\n",
 | 
			
		||||
      "正在检查 星辰 的博客 https://blog.6ing.top/\n",
 | 
			
		||||
      "正在检查 辞琼 的博客 https://blog.wsq127.top/\n",
 | 
			
		||||
      "正在检查 Efu 的博客 https://www.efu.me/\n",
 | 
			
		||||
      "正在检查 茗辰原 的博客 https://mingcy.xyz/\n",
 | 
			
		||||
      "正在检查 葱苓sama 的博客 https://blog.ciraos.top/\n",
 | 
			
		||||
      "Fiveth 的博客 https://blog.fiveth.cc/ 无法访问\n",
 | 
			
		||||
      "正在检查 HiPeach 的博客 https://blog.opeach.cn\n",
 | 
			
		||||
      "微笔记 的博客 https://flytusky.top 无法访问\n",
 | 
			
		||||
      "正在检查 微生之最 的博客 https://www.bbixb.top/\n",
 | 
			
		||||
      "正在检查 蛋蛋困了 的博客 https://blog.wzwzx.cn/\n",
 | 
			
		||||
      "正在检查 凉心 的博客 https://www.lxink.cn/\n",
 | 
			
		||||
      "正在检查 陌颜Hao 的博客 https://blog.imoyan.top/\n",
 | 
			
		||||
      "正在检查 M.Talen 的博客 https://blog.talen.top/\n",
 | 
			
		||||
      "正在检查 轻笑 的博客 https://www.qcqx.cn/\n",
 | 
			
		||||
      "正在检查 虹墨 的博客 https://www.imaegoo.com/\n",
 | 
			
		||||
      "正在检查 痕迹小站 的博客 https://www.henjinet.com/\n",
 | 
			
		||||
      "Dreamaker 的博客 http://dreamakerr.cn/ 无法访问\n",
 | 
			
		||||
      "正在检查 纸鹿本鹿 的博客 https://blog.zhilu.cyou\n",
 | 
			
		||||
      "SerMs 的博客 https://blog.serms.top/ 无法访问\n",
 | 
			
		||||
      "正在检查 XINGYE 的博客 https://blog.xing-ye.top/\n",
 | 
			
		||||
      "GanSer 的博客 https://www.gan1ser.top/ 无法访问\n",
 | 
			
		||||
      "正在检查 六月是只猫 的博客 https://www.lyszm.com/\n",
 | 
			
		||||
      "正在检查 凌云 的博客 https://www.linyunlink.top/\n",
 | 
			
		||||
      "ZHI-BLOG 的博客 https://blog.zhwei.tech/ 无法访问\n",
 | 
			
		||||
      "正在检查 BUZZ 的博客 https://blog.buzzchat.top/\n",
 | 
			
		||||
      "正在检查 云晓晨 的博客 https://www.catchyxc.com/\n",
 | 
			
		||||
      "安知鱼 的博客 https://blog.anheyu.com/ 无法访问\n",
 | 
			
		||||
      "正在检查 唐志远 的博客 https://fe32.top/\n",
 | 
			
		||||
      "数据处理完成\n",
 | 
			
		||||
      "总共有 77 位朋友,其中 59 位博客可访问,18 位博客无法访问\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "# 引入 check_feed 和 parse_feed 函数\n",
 | 
			
		||||
    "from friend_circle_lite.get_info import fetch_and_process_data\n",
 | 
			
		||||
    "import json\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "result = fetch_and_process_data(\"https://blog.qyliu.top/friend.json\")\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 将结果保存为 JSON 文件\n",
 | 
			
		||||
    "with open(\"result.json\", \"w\", encoding=\"utf-8\") as f:\n",
 | 
			
		||||
    "    json.dump(result, f, ensure_ascii=False, indent=2)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 5,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "['atom', 'https://www.linyunlink.top/atom.xml']\n",
 | 
			
		||||
      "不可链接的FEED地址:['atom', 'https://www.linyunlink.top/atom.xml']: No connection adapters were found for \"['atom', 'https://www.linyunlink.top/atom.xml']\"\n",
 | 
			
		||||
      "{'website_name': '', 'author': '', 'link': '', 'articles': []}\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "from friend_circle_lite.get_info import check_feed, parse_feed\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "url = check_feed(\"https://www.linyunlink.top/\")\n",
 | 
			
		||||
    "print(url)\n",
 | 
			
		||||
    "feed = parse_feed(url)\n",
 | 
			
		||||
    "print(feed)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 2,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "from friend_circle_lite.get_info import sort_articles_by_time\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "sorted_result = sort_articles_by_time(result)\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 将结果保存为 JSON 文件\n",
 | 
			
		||||
    "with open(\"result.json\", \"w\", encoding=\"utf-8\") as f:\n",
 | 
			
		||||
    "    json.dump(sorted_result, f, ensure_ascii=False, indent=2)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 4,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "{'website_name': '张洪Heo', 'author': '', 'link': 'https://blog.zhheo.com/', 'articles': [{'title': '新配了一个专门玩英雄联盟的台式机主机,分享下配置和过程', 'author': '', 'link': 'https://blog.zhheo.com/p/9a18f6bb.html', 'published': '2024-06-20 01:55', 'summary': '<p>我只玩英雄联盟这一个网游,之前一直在用我淘汰下来的MacBook Pro 2019 intel i7', 'content': '<p>我只玩英雄联盟这一个网游,之前一直在用我淘汰下来的MacBook Pro 2019 intel i7'}, {'title': '解决iOS邮箱使用QQ邮箱账户无法发送邮件,连接发信服务器失败问题', 'author': '', 'link': 'https://blog.zhheo.com/p/cb3f5ed0.html', 'published': '2024-06-20 01:53', 'summary': '<p>我最近从qq邮箱换到了iOS自带的邮件客户端,一方面是因为自从qq邮箱出了会员之后一堆广告,一方面就是iOS18将支持邮件筛选,我也想用用。但是使用QQ邮箱连接会有一些问题。</p>\\n<p>一方面是需要授权码验证,这个大家都比较清楚,登录的密码不是你的qq密码,而是qq邮箱', 'content': '<p>我最近从qq邮箱换到了iOS自带的邮件客户端,一方面是因为自从qq邮箱出了会员之后一堆广告,一方面就是iOS18将支持邮件筛选,我也想用用。但是使用QQ邮箱连接会有一些问题。</p>\\n<p>一方面是需要授权码验证,这个大家都比较清楚,登录的密码不是你的qq密码,而是qq邮箱'}, {'title': '要排除的文件已经被上传到远程Git仓库怎么办', 'author': '', 'link': 'https://blog.zhheo.com/p/faf797ab.html', 'published': '2024-06-19 09:43', 'summary': '<p>mac有个毒瘤文件就是DS_Store,如果没有全局排除掉那么上传新项目的时候很容易就被上传。还有一些其他的构建文件如果没有设置好<code>.gitignore</code>也会被上传。</p>\\n<p>有的时候排除文件在我们项目过一段时间后才想起来添加,但是那时候远程库已', 'content': '<p>mac有个毒瘤文件就是DS_Store,如果没有全局排除掉那么上传新项目的时候很容易就被上传。还有一些其他的构建文件如果没有设置好<code>.gitignore</code>也会被上传。</p>\\n<p>有的时候排除文件在我们项目过一段时间后才想起来添加,但是那时候远程库已'}, {'title': 'Mac安装java17(openjdk@17)支持M系列芯片和intel芯片', 'author': '', 'link': 'https://blog.zhheo.com/p/6243d392.html', 'published': '2024-06-17 02:46', 'summary': '<p>因为入坑Halo插件开发,装环境研究了一个多小时。主要是错误的安装了版本。Halo的java版本要求是17,这个教程介绍安装java17,通过homebrew来进行安装,然后添加链接即可。</p>\\n<h2 id=\"安装homebrew\"><a', 'content': '<p>因为入坑Halo插件开发,装环境研究了一个多小时。主要是错误的安装了版本。Halo的java版本要求是17,这个教程介绍安装java17,通过homebrew来进行安装,然后添加链接即可。</p>\\n<h2 id=\"安装homebrew\"><a'}, {'title': '是时候去体验下 Apple Vision Pro 了', 'author': '', 'link': 'https://blog.zhheo.com/p/802d5b6b.html', 'published': '2024-06-14 08:19', 'summary': '<p>Vision Pro 已经支持了中国大陆销售,售价29999。当然这不是重点,主要Vision', 'content': '<p>Vision Pro 已经支持了中国大陆销售,售价29999。当然这不是重点,主要Vision'}]}\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "from friend_circle_lite.get_info import fetch_and_process_data, check_feed, parse_feed\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "print(parse_feed(check_feed(\"https://blog.zhheo.com/\")[-1]))"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 4,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "python-dateutil 版本: 2.8.2\n",
 | 
			
		||||
      "requests 版本: 2.31.0\n",
 | 
			
		||||
      "feedparser 版本: 6.0.11\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import datetime\n",
 | 
			
		||||
    "import dateutil\n",
 | 
			
		||||
    "import requests\n",
 | 
			
		||||
    "import feedparser\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 打印 datetime 包的版本\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 打印 python-dateutil 包的版本\n",
 | 
			
		||||
    "print(f\"python-dateutil 版本: {dateutil.__version__}\")\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 打印 requests 包的版本\n",
 | 
			
		||||
    "print(f\"requests 版本: {requests.__version__}\")\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "# 打印 feedparser 包的版本\n",
 | 
			
		||||
    "print(f\"feedparser 版本: {feedparser.__version__}\")\n"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": null,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": []
 | 
			
		||||
  }
 | 
			
		||||
 ],
 | 
			
		||||
 "metadata": {
 | 
			
		||||
  "kernelspec": {
 | 
			
		||||
   "display_name": "base",
 | 
			
		||||
   "language": "python",
 | 
			
		||||
   "name": "python3"
 | 
			
		||||
  },
 | 
			
		||||
  "language_info": {
 | 
			
		||||
   "codemirror_mode": {
 | 
			
		||||
    "name": "ipython",
 | 
			
		||||
    "version": 3
 | 
			
		||||
   },
 | 
			
		||||
   "file_extension": ".py",
 | 
			
		||||
   "mimetype": "text/x-python",
 | 
			
		||||
   "name": "python",
 | 
			
		||||
   "nbconvert_exporter": "python",
 | 
			
		||||
   "pygments_lexer": "ipython3",
 | 
			
		||||
   "version": "3.11.5"
 | 
			
		||||
  }
 | 
			
		||||
 },
 | 
			
		||||
 "nbformat": 4,
 | 
			
		||||
 "nbformat_minor": 2
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										0
									
								
								friend_circle_lite/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								friend_circle_lite/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										14
									
								
								friend_circle_lite/get_conf.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								friend_circle_lite/get_conf.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,14 @@
 | 
			
		||||
import yaml
 | 
			
		||||
 | 
			
		||||
def load_config(config_file):
    """
    Read a YAML configuration file and return its parsed content.

    Args:
        config_file (str): Path of the configuration file.

    Returns:
        The object produced by ``yaml.safe_load`` for the file content
        (a dict for a mapping-style configuration file).
    """
    # The file is always decoded as UTF-8.
    with open(config_file, mode='r', encoding='utf-8') as stream:
        settings = yaml.safe_load(stream)
    return settings
 | 
			
		||||
							
								
								
									
										264
									
								
								friend_circle_lite/get_info.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										264
									
								
								friend_circle_lite/get_info.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,264 @@
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from dateutil import parser
 | 
			
		||||
import requests
 | 
			
		||||
import feedparser
 | 
			
		||||
from concurrent.futures import ThreadPoolExecutor, as_completed
 | 
			
		||||
 | 
			
		||||
# Request headers sent with every HTTP request made by this module.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/126.0.0.0 Safari/537.36'
    ),
}

# (connect timeout, read timeout) in seconds, so a slow host cannot stall a request.
timeout = (5, 10)
 | 
			
		||||
 | 
			
		||||
def format_published_time(time_str):
    """
    Normalize a feed timestamp to the string format ``YYYY-MM-DD HH:MM``.

    The value is first handed to ``dateutil.parser.parse``; if that fails, a
    fixed list of ``strptime`` formats is tried in order.

    Args:
        time_str (str): Timestamp text taken from a feed entry.

    Returns:
        str: The formatted timestamp, or ``''`` when the input is empty,
        not parseable text, or matches none of the known formats.
    """
    # A missing value can never be parsed; report "unknown" right away
    # instead of letting the parser raise.
    if not time_str:
        return ''

    try:
        # Let dateutil guess the format first.
        parsed_time = parser.parse(time_str)
        return parsed_time.strftime('%Y-%m-%d %H:%M')
    except (ValueError, OverflowError, TypeError):
        # dateutil's ParserError derives from ValueError, so it is covered
        # here; OverflowError/TypeError cover out-of-range and non-text input.
        pass

    time_formats = [
        '%a, %d %b %Y %H:%M:%S %z',       # Mon, 11 Mar 2024 14:08:32 +0000
        '%a, %d %b %Y %H:%M:%S GMT',      # Wed, 19 Jun 2024 09:43:53 GMT
        '%Y-%m-%dT%H:%M:%S%z',            # 2024-03-11T14:08:32+00:00
        '%Y-%m-%dT%H:%M:%SZ',             # 2024-03-11T14:08:32Z
        '%Y-%m-%d %H:%M:%S',              # 2024-03-11 14:08:32
        '%Y-%m-%d'                        # 2024-03-11
    ]

    for fmt in time_formats:
        try:
            parsed_time = datetime.strptime(time_str, fmt)
            return parsed_time.strftime('%Y-%m-%d %H:%M')
        except (ValueError, TypeError):
            continue

    # Nothing matched: return an empty string as the "unknown time" marker.
    return ''
 | 
			
		||||
 | 
			
		||||
def check_feed(blog_url, session):
    """
    Find a reachable feed URL for a blog.

    '/atom.xml', '/rss2.xml' and '/feed' are appended (in that order) to the
    blog URL with trailing slashes removed. The first candidate that answers
    with HTTP status 200 wins.

    Args:
        blog_url (str): Base URL of the blog.
        session (requests.Session): Session used to issue the requests.

    Returns:
        list: ``['atom', url]``, ``['rss2', url]`` or ``['feed', url]`` for the
        first reachable candidate, or ``['none', blog_url]`` when no candidate
        returned status 200.
    """
    base_url = blog_url.rstrip('/')
    candidates = (
        ('atom', base_url + '/atom.xml'),
        ('rss2', base_url + '/rss2.xml'),
        ('feed', base_url + '/feed'),
    )

    for feed_type, candidate_url in candidates:
        try:
            reply = session.get(candidate_url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # A failed request simply means: try the next candidate.
            continue
        if reply.status_code == 200:
            return [feed_type, candidate_url]

    return ['none', blog_url]
 | 
			
		||||
 | 
			
		||||
def parse_feed(url, session, count=5):
    """
    Download and parse an Atom/RSS feed.

    Args:
        url (str): URL of the feed.
        session (requests.Session): Session used to issue the request.
        count (int): Maximum number of entries to return; feeds with fewer
            entries are returned completely.

    Returns:
        dict: ``{'website_name', 'author', 'link', 'articles'}`` where
        ``articles`` is a list of dicts with the keys ``title``, ``author``,
        ``link``, ``published``, ``summary`` and ``content``. Missing fields
        are empty strings. If anything goes wrong, the error is printed and a
        dict with empty strings and an empty ``articles`` list is returned.
    """
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        # The body is always decoded as UTF-8, whatever the server declared.
        response.encoding = 'utf-8'
        feed = feedparser.parse(response.text)

        result = {
            'website_name': feed.feed.title if 'title' in feed.feed else '',
            'author': feed.feed.author if 'author' in feed.feed else '',
            'link': feed.feed.link if 'link' in feed.feed else '',
            'articles': []
        }

        for i, entry in enumerate(feed.entries):
            if i >= count:
                break

            # Some feeds only provide an "updated" timestamp; use it when
            # "published" is absent so the article still gets a time.
            if 'published' in entry:
                published = format_published_time(entry.published)
            elif 'updated' in entry:
                published = format_published_time(entry.updated)
            else:
                published = ''

            # Prefer the full content, then the description, then nothing.
            if 'content' in entry and entry.content:
                content = entry.content[0].value
            elif 'description' in entry:
                content = entry.description
            else:
                content = ''

            article = {
                'title': entry.title if 'title' in entry else '',
                'author': entry.author if 'author' in entry else '',
                'link': entry.link if 'link' in entry else '',
                'published': published,
                'summary': entry.summary if 'summary' in entry else '',
                'content': content
            }
            result['articles'].append(article)

        return result
    except Exception as e:
        # Best effort: one broken feed must not stop the whole crawl.
        print(f"不可链接的FEED地址:{url}: {e}")
        return {
            'website_name': '',
            'author': '',
            'link': '',
            'articles': []
        }
 | 
			
		||||
 | 
			
		||||
def process_friend(friend, session, count):
    """
    Collect the latest articles of a single friend.

    Args:
        friend (list): ``[name, blog_url, avatar]``.
        session (requests.Session): Session used to issue the requests.
        count (int): Maximum number of articles to fetch for the blog.

    Returns:
        dict: ``{'name', 'status', 'articles'}``. ``status`` is ``'active'``
        when a feed URL was found and ``'error'`` otherwise (with an empty
        ``articles`` list).
    """
    name, blog_url, avatar = friend
    feed_type, feed_url = check_feed(blog_url, session)

    # Guard clause: no reachable feed means nothing to collect.
    if feed_type == 'none':
        print(f"{name} 的博客 {blog_url} 无法访问")
        return {
            'name': name,
            'status': 'error',
            'articles': []
        }

    feed_info = parse_feed(feed_url, session, count)

    articles = []
    for item in feed_info['articles']:
        # Author and avatar come from the friends list, not from the feed.
        articles.append({
            'title': item['title'],
            'created': item['published'],
            'link': item['link'],
            'author': name,
            'avatar': avatar
        })

    for article in articles:
        print(f"{name} 发布了新文章:{article['title']}, 时间:{article['created']}")

    return {
        'name': name,
        'status': 'active',
        'articles': articles
    }
 | 
			
		||||
 | 
			
		||||
def fetch_and_process_data(json_url, count=5):
    """
    Download the friends list and collect the latest articles of every friend.

    Args:
        json_url (str): URL of the JSON document holding the friends list
            under the key ``friends``.
        count (int): Maximum number of articles to fetch per blog.

    Returns:
        dict | None: ``{'statistical_data': {...}, 'article_data': [...]}``,
        or ``None`` when the friends list could not be downloaded, decoded,
        or does not contain the ``friends`` key.
    """
    # The context manager closes the session (and its pooled connections)
    # once every worker has finished.
    with requests.Session() as session:
        try:
            response = session.get(json_url, headers=headers, timeout=timeout)
            # Reading the key inside the try block makes a malformed payload
            # take the same failure path as an unreachable URL.
            friends = response.json()['friends']
        except Exception as e:
            print(f"无法获取该链接:{json_url}, 出现的问题为:{e}")
            return None

        total_friends = len(friends)
        active_friends = 0
        error_friends = 0
        total_articles = 0
        article_data = []

        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_friend = {
                executor.submit(process_friend, friend, session, count): friend
                for friend in friends
            }

            for future in as_completed(future_to_friend):
                friend = future_to_friend[future]
                try:
                    friend_result = future.result()
                    if friend_result['status'] == 'active':
                        active_friends += 1
                        article_data.extend(friend_result['articles'])
                        total_articles += len(friend_result['articles'])
                    else:
                        error_friends += 1
                except Exception as e:
                    # A crash while processing one friend counts as an error
                    # for that friend only.
                    print(f"处理 {friend} 时发生错误: {e}")
                    error_friends += 1

    result = {
        'statistical_data': {
            'friends_num': total_friends,
            'active_num': active_friends,
            'error_num': error_friends,
            'article_num': total_articles,
            'last_updated_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        },
        'article_data': article_data
    }

    print("数据处理完成")
    print("总共有 %d 位朋友,其中 %d 位博客可访问,%d 位博客无法访问" % (total_friends, active_friends, error_friends))

    return result
 | 
			
		||||
 | 
			
		||||
def sort_articles_by_time(data):
    """
    Sort the articles in ``data['article_data']`` from newest to oldest.

    Articles whose ``created`` value is missing or not in the
    ``YYYY-MM-DD HH:MM`` format are kept and placed after all dated
    articles, in their original relative order.

    Args:
        data (dict): Result dict; the list under ``article_data`` is replaced
            by its sorted version. A dict without that key is returned
            unchanged.

    Returns:
        dict: The same ``data`` object.
    """
    def _created_at(article):
        # An empty or malformed timestamp must not abort the whole sort;
        # datetime.min pushes such articles to the end of the
        # newest-first ordering.
        try:
            return datetime.strptime(article['created'], '%Y-%m-%d %H:%M')
        except (KeyError, TypeError, ValueError):
            return datetime.min

    if 'article_data' in data:
        data['article_data'] = sorted(
            data['article_data'],
            key=_created_at,
            reverse=True
        )
    return data
 | 
			
		||||
							
								
								
									
										5
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,5 @@
 | 
			
		||||
# The datetime module is part of the Python standard library and needs no
# installation; the PyPI package of that name is an unrelated project.
python-dateutil==2.9.0.post0
requests
feedparser==6.0.11
PyYAML==6.0.1
 | 
			
		||||
							
								
								
									
										16
									
								
								run.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								run.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,16 @@
 | 
			
		||||
# Import the data-fetching and sorting helpers, plus the config loader
 | 
			
		||||
from friend_circle_lite.get_info import fetch_and_process_data, sort_articles_by_time
 | 
			
		||||
from friend_circle_lite.get_conf import load_config
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
# Spider section: crawl the friends' feeds and write the merged, time-sorted
# result to all.json.
config = load_config("./conf.yml")
if config["spider_settings"]["enable"]:
    print("爬虫已启用")
    json_url = config['spider_settings']['json_url']
    article_count = config['spider_settings']['article_count']
    print("正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章".format(json_url=json_url, article_count=article_count))
    result = fetch_and_process_data(json_url=json_url, count=article_count)
    if result is None:
        # The fetch function returns None when the friends list could not be
        # loaded. Stop with a clear message and a non-zero exit status rather
        # than failing later with an opaque TypeError.
        raise SystemExit("获取友链数据失败,未生成 all.json")
    sorted_result = sort_articles_by_time(result)
    with open("all.json", "w", encoding="utf-8") as f:
        json.dump(sorted_result, f, ensure_ascii=False, indent=2)
 | 
			
		||||
		Reference in New Issue
	
	Block a user