diff --git a/deploy.sh b/deploy.sh index b5bb679..8534dbc 100644 --- a/deploy.sh +++ b/deploy.sh @@ -1,2 +1,46 @@ #!/bin/bash -nohup python3 server.py > grab.log 2>&1 & + +# 获取当前脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# 定义日志文件路径 +CRON_LOG_FILE="$SCRIPT_DIR/cron_grab.log" +API_LOG_FILE="$SCRIPT_DIR/api_grab.log" + +# 定义要执行的命令 +COMMAND="python3 $SCRIPT_DIR/run.py" + +# 定义定时任务的执行间隔(例如每四小时一次) +INTERVAL="4" + +# 添加定时任务到 crontab +(crontab -l 2>/dev/null; echo "0 */$INTERVAL * * * $COMMAND >> $CRON_LOG_FILE 2>&1 && echo '运行成功'") | crontab - + +echo "====================================" +echo "定时爬取 成功设置,时间间隔:4h" +echo "定时任务日志:$CRON_LOG_FILE" +echo "====================================" + +# 后台运行服务端,将数据映射到API +echo "****正在启动API服务****" +nohup python3 $SCRIPT_DIR/server.py > $API_LOG_FILE 2>&1 & +API_PID=$! +sleep 5 # 等待API服务启动,可能需要调整等待时间 + +echo "API 服务已启动:http://localhost:1223" +echo "API 服务日志:$API_LOG_FILE" +echo "API 服务进程号:$API_PID" +echo "API 服务关闭命令:kill -9 $API_PID" +echo "文档地址:https://blog.qyliu.top/posts/4dc716ec/" +echo "====================================" + +# 用户选择是否执行爬取 +read -p "选择操作:0 - 退出, 1 - 执行一次爬取: " USER_CHOICE + +if [ "$USER_CHOICE" -eq 1 ]; then + echo "****正在执行一次爬取****" + python3 $SCRIPT_DIR/run.py + echo "****爬取成功****" +else + echo "退出选项被选择。" +fi diff --git a/grab.log b/grab.log deleted file mode 100644 index 5a84c82..0000000 --- a/grab.log +++ /dev/null @@ -1,3 +0,0 @@ -2024-07-28 21:23:58,910 - INFO - ʼץȡ... -2024-07-28 21:23:58,914 - INFO - ڴ https://blog.qyliu.top/friend.json лȡÿͻȡ 5 ƪ -2024-07-28 21:25:05,362 - INFO - ץȡɹ diff --git a/readme.md b/readme.md index 40a1fe2..edb456f 100644 --- a/readme.md +++ b/readme.md @@ -10,22 +10,28 @@ ## 开发进度 +### 2024-08-03 + +* 将自部署分离为API服务和定时爬取 +* 尝试更加系统的启动脚本 +* 删除server.py中的爬取内容,使用定时任务crontab实现 + ### 2024-07-28 * 自部署添加跨域请求 * 修复内存占用异常问题 * 将html资源分开存放,实现更加美观的页面 -### 2024-07-26 +
+查看更多 +

2024-07-26

* 自部署添加跨域请求 * 添加`/rss.xml`,`/feed/`,`feed.xml`接口的爬取,提高兼容性 * 修复PJAX下会多次出现模态框的问题,并且切换页面不消失 * 修复模态框宽度问题,添加日历图标以更加美观 -
-查看更多 -### 2024-07-25 +

2024-07-25

* 自部署正在开发中,仅供测试 * 添加`/errors.json`,用于获取丢失友链数据,提高自定义程度 diff --git a/server.py b/server.py index c3af5e0..a09c076 100644 --- a/server.py +++ b/server.py @@ -24,31 +24,9 @@ app.add_middleware( ) # 配置日志记录 -log_file = "grab.log" +log_file = "cron_grab.log" logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -data_lock = Lock() - -def fetch_articles(): - logging.info("开始抓取文章...") - try: - config = load_config("./conf.yaml") - if config["spider_settings"]["enable"]: - json_url = config['spider_settings']['json_url'] - article_count = config['spider_settings']['article_count'] - logging.info(f"正在从 {json_url} 中获取,每个博客获取 {article_count} 篇文章") - result, errors = fetch_and_process_data(json_url=json_url, count=article_count) - sorted_result = sort_articles_by_time(result) - with open("all.json", "w", encoding="utf-8") as f: - json.dump(sorted_result, f, ensure_ascii=False, indent=2) - with open("errors.json", "w", encoding="utf-8") as f: - json.dump(errors, f, ensure_ascii=False, indent=2) - logging.info("文章抓取成功") - else: - logging.warning("抓取设置在配置中被禁用。") - except Exception as e: - logging.error(f"抓取文章时出错: {e}") - @app.get("/", response_class=HTMLResponse) async def root(): try: @@ -95,24 +73,11 @@ async def get_random_article(): except json.JSONDecodeError: return JSONResponse(content={"error": "Failed to decode JSON"}, status_code=500) -def schedule_tasks(): - schedule.every(4).hours.do(fetch_articles) - while True: - schedule.run_pending() - time.sleep(1) - if __name__ == '__main__': # 清空日志文件 if os.path.exists(log_file): with open(log_file, 'w'): pass - - fetch_articles() # 启动时立即抓取一次 - - # 启动调度任务线程 - task_thread = Thread(target=schedule_tasks) - task_thread.start() - # 启动FastAPI应用 import uvicorn uvicorn.run(app, host='0.0.0.0', port=1223)