# 代码
# -*- coding: utf-8 -*-
"""
# @file name : ICCV2023-PDF-downloader.py
# @author : 蔡不菜和他的uU们 https://www.caibucai.top/
# @date : 2023-10-05 15:31
# @brief : ICCV 2023 paper 下载
"""
import concurrent.futures
import os
import threading
import xml

import requests
import urllib3
from bs4 import BeautifulSoup

# Entry page of the ICCV 2023 open-access site, and the host that the
# relative form action / PDF links found on it are appended to.
start_url = 'https://openaccess.thecvf.com/ICCV2023'
base_url = "https://openaccess.thecvf.com"
# Downloads are stored in a per-keyword sub-directory of this folder.
dst_dir = './ICCV2023/'
# Seconds to wait on each HTTP request so a stalled connection cannot hang
# the script forever (requests has no timeout by default).
request_timeout = 30

response = requests.get(start_url, timeout=request_timeout)
# Fail early with the HTTP error instead of trying to parse an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the search form; its "action" attribute is the path the query is
# posted to.
form = soup.find('form')
print(form)
action = form.get('action') if form is not None else None
if not action:
    # Without a form action there is no search endpoint to query.
    raise SystemExit('No search form with an action found at ' + start_url)
search_url = base_url + action
print('search_url: ', search_url)

# Ask for the search keyword; it is sent as the "query" form field and is
# also used as the name of the download sub-directory.
keyword = input('please input search key:\n')
data = {"query": keyword}
headers = {
    "User-Agent": "LogStatistic"
}
pdf_urls = []

# POST the query and collect the target of every anchor whose text is
# exactly "pdf".
search_response = requests.post(
    search_url, data=data, headers=headers, timeout=request_timeout
)
if search_response.status_code == 200:
    print('search success')
    search_soup = BeautifulSoup(search_response.text, "html.parser")
    print(search_soup)
    pdf_links = search_soup.find_all("a")
    for pdf_link in pdf_links:
        href = pdf_link.get('href')
        # An anchor without an href cannot be turned into a download URL.
        if pdf_link.text != 'pdf' or not href:
            continue
        print(pdf_link)
        pdf_urls.append(base_url + href)
    pdf_cnt = len(pdf_urls)
    print("find ", pdf_cnt, " papers!")
else:
    print("Failed to send, please check")

print('start downloading ')
dst_path = os.path.join(dst_dir, keyword)
# exist_ok avoids the check-then-create race and is a no-op when the folder
# is already there.
os.makedirs(dst_path, exist_ok=True)
def download_pdf(url, dst_path, timeout=30):
    """Download one PDF and save it inside ``dst_path``.

    The local file name is the last ``/``-separated segment of ``url``.
    Failures are reported on stdout rather than raised, so one bad download
    does not kill the worker thread that runs this function.

    Args:
        url: Address of the PDF to fetch.
        dst_path: Existing directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The outcome is printed.
    """
    file_name = url.split("/")[-1]
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure.
        print(f"Failed to download: {url} ({exc})")
        return
    if response.status_code != 200:
        print(f"Failed to download: {url}")
        return
    # Write the raw bytes; PDFs are binary, so no text decoding is involved.
    with open(os.path.join(dst_path, file_name), "wb") as file:
        file.write(response.content)
    print(f"Downloaded: {file_name}")
# Upper bound on simultaneous downloads. A broad keyword can match a very
# large number of papers, and one thread per paper would open that many
# connections at once.
max_workers = 8

# Fan the downloads out over a bounded worker pool; leaving the "with" block
# waits for every submitted download to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each future back to its URL so a failure can be reported by name.
    futures = {
        executor.submit(download_pdf, url, dst_path): url
        for url in pdf_urls
    }
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            # An exception raised inside a worker is stored on the future;
            # surface it instead of dropping it silently.
            print(f"Failed to download: {futures[future]} ({error})")
print("All PDF files downloaded.")
# 说明
# 如有问题,后期会持续维护更新;欢迎留言、进群讨论或私聊。【群号:392784757】