🆕 Try implementing the scheduled task and the API with a script
This commit is contained in:
parent f9fddaa63e
commit 8afdc06741

deploy.sh (46 lines changed)
@@ -1,2 +1,46 @@
 #!/bin/bash
-nohup python3 server.py > grab.log 2>&1 &
+
+# Resolve the directory this script lives in
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# Log file paths
+CRON_LOG_FILE="$SCRIPT_DIR/cron_grab.log"
+API_LOG_FILE="$SCRIPT_DIR/api_grab.log"
+
+# Command to run on schedule
+COMMAND="python3 $SCRIPT_DIR/run.py"
+
+# Interval between runs, in hours (here: every four hours)
+INTERVAL="4"
+
+# Append the job to the crontab: "0 */$INTERVAL * * *" fires at minute 0 of every fourth hour
+(crontab -l 2>/dev/null; echo "0 */$INTERVAL * * * $COMMAND >> $CRON_LOG_FILE 2>&1 && echo 'run succeeded'") | crontab -
+
+echo "===================================="
+echo "Scheduled crawl installed, interval: 4h"
+echo "Cron job log: $CRON_LOG_FILE"
+echo "===================================="
+
+# Run the server in the background to expose the data through the API
+echo "****Starting the API service****"
+nohup python3 $SCRIPT_DIR/server.py > $API_LOG_FILE 2>&1 &
+API_PID=$!
+sleep 5  # Wait for the API service to start; the delay may need tuning
+
+echo "API service started: http://localhost:1223"
+echo "API service log: $API_LOG_FILE"
+echo "API service PID: $API_PID"
+echo "Stop the API service with: kill -9 $API_PID"
+echo "Documentation: https://blog.qyliu.top/posts/4dc716ec/"
+echo "===================================="
+
+# Let the user choose whether to run a crawl right away
+read -p "Choose an action: 0 - exit, 1 - run one crawl now: " USER_CHOICE
+
+if [ "$USER_CHOICE" -eq 1 ]; then
+    echo "****Running one crawl****"
+    python3 $SCRIPT_DIR/run.py
+    echo "****Crawl finished****"
+else
+    echo "Exit selected."
+fi
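
The script only installs the cron entry and starts the service; nothing in this commit tears them down. A minimal companion sketch of a teardown (hypothetical, not part of the commit — the stop.sh name and the grep/pkill patterns are assumptions):

#!/bin/bash
# stop.sh - hypothetical teardown counterpart to deploy.sh (not in this commit)
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Remove the scheduled crawl by filtering our run.py entry out of the crontab
crontab -l 2>/dev/null | grep -v "$SCRIPT_DIR/run.py" | crontab -

# Stop the API service by matching its command line
pkill -f "python3 $SCRIPT_DIR/server.py"

echo "Cron entry removed and API service stopped."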

grab.log (3 lines deleted)
@@ -1,3 +0,0 @@
-2024-07-28 21:23:58,910 - INFO - Starting article crawl...
-2024-07-28 21:23:58,914 - INFO - Fetching from https://blog.qyliu.top/friend.json, 5 articles per blog
-2024-07-28 21:25:05,362 - INFO - Articles crawled successfully

readme.md (14 lines changed)
@@ -10,22 +10,28 @@
 ## Development progress

+### 2024-08-03
+
+* Split the self-hosted deployment into an API service and a scheduled crawl
+* Try a more systematic startup script
+* Remove the crawling code from server.py; it is now handled by a crontab scheduled task
+
 ### 2024-07-28

 * Add cross-origin requests to the self-hosted deployment
 * Fix abnormal memory usage
 * Store the HTML assets separately for a nicer page

-### 2024-07-26
+<details>
+<summary>Show more</summary>
+<h3>2024-07-26</h3>

 * Add cross-origin requests to the self-hosted deployment
 * Also crawl the `/rss.xml`, `/feed/`, and `feed.xml` endpoints for better compatibility
 * Fix the modal appearing multiple times under PJAX and not disappearing on page switches
 * Fix the modal width and add a calendar icon for a nicer look

-<details>
-<summary>Show more</summary>
-### 2024-07-25
+<h3>2024-07-25</h3>

 * The self-hosted deployment is still in development, for testing only
 * Add `/errors.json` for retrieving lost friend-link data, allowing more customization

server.py (37 lines changed)
@@ -24,31 +24,9 @@ app.add_middleware(
 )

 # Configure logging
-log_file = "grab.log"
+log_file = "cron_grab.log"
 logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

-data_lock = Lock()
-
-def fetch_articles():
-    logging.info("Starting article crawl...")
-    try:
-        config = load_config("./conf.yaml")
-        if config["spider_settings"]["enable"]:
-            json_url = config['spider_settings']['json_url']
-            article_count = config['spider_settings']['article_count']
-            logging.info(f"Fetching from {json_url}, {article_count} articles per blog")
-            result, errors = fetch_and_process_data(json_url=json_url, count=article_count)
-            sorted_result = sort_articles_by_time(result)
-            with open("all.json", "w", encoding="utf-8") as f:
-                json.dump(sorted_result, f, ensure_ascii=False, indent=2)
-            with open("errors.json", "w", encoding="utf-8") as f:
-                json.dump(errors, f, ensure_ascii=False, indent=2)
-            logging.info("Articles crawled successfully")
-        else:
-            logging.warning("Crawling is disabled in the configuration.")
-    except Exception as e:
-        logging.error(f"Error while crawling articles: {e}")
-
 @app.get("/", response_class=HTMLResponse)
 async def root():
     try:
@@ -95,24 +73,11 @@ async def get_random_article():
     except json.JSONDecodeError:
         return JSONResponse(content={"error": "Failed to decode JSON"}, status_code=500)

-def schedule_tasks():
-    schedule.every(4).hours.do(fetch_articles)
-    while True:
-        schedule.run_pending()
-        time.sleep(1)
-
 if __name__ == '__main__':
     # Clear the log file
     if os.path.exists(log_file):
         with open(log_file, 'w'):
             pass

-    fetch_articles()  # Crawl once immediately on startup
-
-    # Start the scheduler thread
-    task_thread = Thread(target=schedule_tasks)
-    task_thread.start()
-
     # Start the FastAPI app
     import uvicorn
     uvicorn.run(app, host='0.0.0.0', port=1223)
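
With the in-process scheduler removed, server.py only serves whatever the cron-driven run.py writes to disk. A quick smoke test of the running service might look like the following (a sketch: the port and the / route appear in this diff, but the /all.json and /errors.json paths are assumptions based on the files the crawler writes and the readme):

# Root page (route confirmed by this diff)
curl -s http://localhost:1223/ | head

# Crawl output and error list - assumed paths, not confirmed by this diff
curl -s http://localhost:1223/all.json | head -c 200
curl -s http://localhost:1223/errors.json | head -c 200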