接的小私活,目标爬取下来题库并整理到Excel里。
目标站点:https://www.mayizhixue.cn/
import requests
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl import load_workbook
import os
# Headers passed as `headers=` to every requests.get call in this file.
# NOTE(review): 'TOKEN' is presumably a placeholder for a real Authorization value — confirm before running.
common_headers = {
'Authorization': 'TOKEN',
}
# Sequence number written into column 1 by write_row; reset to 1 whenever a new output workbook is started.
record_id = 1
# Working workbook, loaded from the template file 'sample.xlsx'; the code that starts a new
# output file reloads it and rebinds `wb`, `ws` and `rows`.
wb = load_workbook(filename='sample.xlsx')
# Worksheet that write_row / get_target_row_number read and write through the module-level name.
ws = wb.active
# NOTE(review): this module-level `rows` is never read by the code visible here
# (get_target_row_number re-reads ws.rows itself) — confirm before removing.
rows = ws.rows
def get_target_row_number():
    """Return the 1-based number of the first free row in the module-level ``ws``.

    A row counts as free when its first cell holds no value. If every existing
    row already has a value in its first cell, the number right after the last
    row is returned (1 for a sheet without rows).
    """
    position = 0
    for position, cells in enumerate(ws.rows, start=1):
        if cells[0].value is None:
            return position
    # No gap found: the next free row is the one after the last scanned row.
    return position + 1
def write_row(row, kIndex=None):
    """Write one question into the first free row of the module-level ``ws``.

    Args:
        row: Dict describing the question. The keys ``title``, ``type``,
            ``options``, ``answer`` and ``analysis`` are read; missing keys
            fall back to an empty string (an empty list for ``options``).
        kIndex: Position of a sub-question. When given, column 1 receives
            ``"<record_id>.<kIndex>"`` and the global ``record_id`` is left
            unchanged; when omitted, column 1 receives ``record_id`` and the
            counter is incremented afterwards.
    """
    global record_id
    target = get_target_row_number()

    def put(column, value):
        # Always assign through .value so an explicit None is written as well.
        ws.cell(row=target, column=column).value = value

    is_sub_question = kIndex is not None
    put(1, f"{record_id}.{kIndex}" if is_sub_question else record_id)
    put(2, row.get('title', ''))
    put(3, row.get('type', ''))
    # Columns 4 and 5 are left untouched; the original code marked them as
    # score and difficulty but never filled them.

    # Options occupy columns 6..10, so at most five options are written.
    for column, option in zip(range(6, 11), row.get('options', [])):
        put(column, option)

    put(11, row.get('answer', ''))
    put(12, row.get('analysis', ''))

    if not is_sub_question:
        record_id = record_id + 1
def map_type_kv(key):
    """Translate a question-type code into its Chinese label.

    Args:
        key: Type code, accepted either as an int or as its string form
            (1, 2, 3, 4 or 6).

    Returns:
        The label for the code: '单选题' (single choice), '多选题' (multiple
        choice), '不定项选择题' (indeterminate choice), '判断题' (true/false)
        or '材料题' (material question with a shared stem).

    Raises:
        SystemExit: With status 1 when the code is not supported. The message
            is printed first. ``SystemExit`` is raised directly instead of
            calling the ``exit()`` helper, which is only installed by the
            ``site`` module and previously ended the run with status 0.
    """
    labels_by_code = (
        (('1', 1), '单选题'),
        (('2', 2), '多选题'),
        (('3', 3), '不定项选择题'),
        (('4', 4), '判断题'),
        (('6', 6), '材料题'),
    )
    # Tuple membership (==) is kept on purpose so unhashable keys still reach
    # the "unsupported" branch instead of raising TypeError.
    for codes, label in labels_by_code:
        if key in codes:
            return label
    print('不支持的类型:%s' % key)
    raise SystemExit(1)
def get_test_question(sectionId, timeout=30):
    """Fetch the questions of one section and write them into the worksheet.

    Args:
        sectionId: Section id, sent as the ``sectionId`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/querySubjectList',
        params={
            'sectionId': sectionId,
            'type': '2',
        },
        headers=common_headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/None error later.
    response.raise_for_status()
    handle_data_2_excel(response.json().get('data'))
def _build_question_row(question, type_name):
    """Convert one API question dict into the row dict consumed by write_row."""
    raw_options = question.get('sOption')
    # Options arrive as one '|'-separated string whose pieces look like
    # "A.text"; the first two characters (letter and dot) are dropped.
    # A missing/empty sOption yields no options instead of crashing on None.
    options = [piece[2:] for piece in raw_options.split('|')] if raw_options else []
    return {
        'title': question.get('issue'),
        'type': type_name,
        'options': options,
        'answer': question.get('answer'),
        'analysis': question.get('analysis'),
    }


def handle_data_2_excel(data):
    """Write every question contained in an API ``data`` payload to the worksheet.

    Args:
        data: Iterable of dicts. Each dict maps a question-type code (see
            map_type_kv) to the list of questions of that type. ``None`` is
            treated as "no questions".

    Plain question types produce one row per question. A material question
    produces one row for its shared stem followed by one row per
    sub-question, numbered via the ``kIndex`` argument of write_row.
    """
    plain_types = ('单选题', '多选题', '不定项选择题', '判断题')
    for group in data or []:
        for key in group:
            type_name = map_type_kv(key)
            questions = group.get(key) or []
            if type_name in plain_types:
                for question in questions:
                    write_row(_build_question_row(question, type_name))
            elif type_name == '材料题':
                for material in questions:
                    # NOTE(review): write_row increments record_id right after
                    # this stem row, so the sub-rows below are labelled with
                    # the following id (stem 3 -> "4.1", "4.2") — confirm this
                    # matches the Excel template.
                    write_row({'title': material.get('stem'), 'type': type_name})
                    # NOTE(review): the key is spelled 'childre' here;
                    # presumably it mirrors the API payload — confirm against
                    # a real response.
                    sub_questions = material.get('childre') or []
                    for sub_index, sub_question in enumerate(sub_questions, start=1):
                        sub_type = map_type_kv(sub_question.get('subType'))
                        write_row(_build_question_row(sub_question, sub_type), sub_index)
def get_exam_question(paperId, timeout=30):
    """Fetch the questions of one exam paper and write them into the worksheet.

    Args:
        paperId: Paper id, sent as the ``paperId`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryoPaperSubjectList',
        params={
            'paperId': paperId,
            'mode': '2',
        },
        headers=common_headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/None error later.
    response.raise_for_status()
    handle_data_2_excel(response.json().get('data'))
def get_catalogue(courseName, courseId, timeout=30):
    """Export the chapter-practice questions of a course, one workbook per leaf section.

    The top-level sections of the course are fetched, a folder
    ``<courseName>/<sectionName>`` is created for each, and the section tree
    is then walked with access_next_level.

    Args:
        courseName: Name of the output root folder.
        courseId: Course id, sent as the ``courseId`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/querySectionList',
        params={'courseId': courseId},
        headers=common_headers,
        timeout=timeout,
    )
    response.raise_for_status()
    # A null `data` field is treated as an empty catalogue.
    for section in response.json().get('data') or []:
        section_name = section.get('sectionName')
        print(f"当前章节ID:{section.get('id')},章节名称:{section_name}")
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(courseName + '/' + section_name, exist_ok=True)
        # Depth-first walk; access_next_level appends the section folders itself.
        access_next_level(courseName + "/", section)
def access_next_level(path, item):
    """Walk a section tree depth-first and export each leaf section to a workbook.

    Args:
        path: Directory prefix accumulated so far; callers pass it with a
            trailing '/'.
        item: Section node (dict). ``sectionName`` and ``id`` are read; a node
            whose ``children`` entry is not None is treated as a branch.

    For a leaf, the module-level ``record_id``, ``wb``, ``ws`` and ``rows``
    are reset from 'sample.xlsx', the section's questions are fetched and the
    workbook is saved as ``<path>/<sectionName>.xlsx``.
    """
    global wb, ws, rows, record_id
    sub_sections = item.get('children')
    if sub_sections is not None:
        # Branch node: recurse into every child below this node's own folder.
        child_dir = path + item.get('sectionName') + '/'
        for sub_section in sub_sections:
            access_next_level(child_dir, sub_section)
        return

    # Leaf node: start from a fresh template, fill it, then save it.
    print(f"当前小节ID:{item.get('id')},小节名称:{item.get('sectionName')}")
    record_id = 1
    wb = load_workbook(filename='sample.xlsx')
    ws = wb.active
    rows = ws.rows
    get_test_question(item.get('id'))
    # Make sure the target directory exists before saving.
    if not os.path.exists(path):
        os.makedirs(path)
    wb.save(f'{path}/{item.get("sectionName")}.xlsx')
def get_product_course_info(id, timeout=30):
    """Return the courses that belong to a product.

    Args:
        id: Product id, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``data`` field of the JSON response. Per the sample recorded in
        the original code, each item looks like::

            courseName: "中国建设银行VIP"
            examId: "43d8625d21614cab9f6a2e323e0cd4db"
            id: "1686999432228376576"

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryProductCourse',
        params={'id': id},
        headers=common_headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/None error later.
    response.raise_for_status()
    return response.json().get('data')
def query_paper_type_list(id, timeout=30):
    """Return the paper types available for a course.

    Args:
        id: Course id, sent as the ``courseId`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``data`` field of the JSON response. Per the sample recorded in
        the original code, each item looks like::

            "id": "1",
            "paperTypeName": "章节练习",
            "icon": null,
            "version": null,
            "isSection": null,
            "hasSection": null

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryPaperTypeList',
        params={'courseId': id},
        headers=common_headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/None error later.
    response.raise_for_status()
    return response.json().get('data')
def get_li_nian_zhen_ti_list(id, paperTypeId, timeout=30):
    """Return the papers of one paper type for a course.

    Args:
        id: Course id, sent as the ``courseId`` query parameter.
        paperTypeId: Paper-type id, sent as the ``paperTypeId`` query parameter.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block forever on a stalled connection.

    Returns:
        The ``data`` field of the JSON response. Per the sample recorded in
        the original code, each item looks like::

            "id": "1703299201187844096",
            "paperName": "2022年银行招聘笔试《中国建设银行》试题",
            "onlineTime": "2023-09-17 00:00:00",
            "referenNumber": 139,
            "tryBuy": 1,
            "hasMake": 3,
            "mode": null

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryPaperList',
        params={
            'courseId': id,
            'paperTypeId': paperTypeId,
        },
        headers=common_headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/None error later.
    response.raise_for_status()
    return response.json().get('data')
# Products to crawl: `id` is passed to get_product_course_info, `name` is used
# as the output folder and as the file-name prefix of the paper workbooks.
course = [
    {'id': 'f753f9934c60427fadfba664229a8487', 'name': '2024年军队文职人员招聘《公共科目》题库'}
]
for courseItem in course:
    # One output folder per product; exist_ok avoids the check-then-create race.
    os.makedirs(courseItem.get('name'), exist_ok=True)
    product_course_info = get_product_course_info(courseItem.get('id'))
    for product_course in product_course_info:
        # Paper types available for this course.
        paper_type_list = query_paper_type_list(product_course.get('id'))
        for paper_type in paper_type_list:
            print(f"当前科目:{product_course.get('courseName')},当前试卷类型:{paper_type.get('paperTypeName')}")
            if paper_type.get('paperTypeName') == '章节练习':
                # Chapter practice: one workbook per leaf section of the catalogue.
                get_catalogue(courseItem.get("name"), product_course.get('id'))
            elif paper_type.get('paperTypeName') in ('历年真题', '考前点题', '模拟试卷', '预测试卷'):
                # Paper-based types: one workbook per paper.
                li_nian_zhen_ti_list = get_li_nian_zhen_ti_list(product_course.get('id'), paper_type.get('id'))
                for li_nian_zhen_ti in li_nian_zhen_ti_list:
                    # Start every paper from a fresh template and restart the numbering;
                    # write_row reads these module-level names.
                    record_id = 1
                    wb = load_workbook(filename='sample.xlsx')
                    ws = wb.active
                    rows = ws.rows
                    print(f"当前试卷ID:{li_nian_zhen_ti.get('id')},试卷名称:{li_nian_zhen_ti.get('paperName')}")
                    get_exam_question(li_nian_zhen_ti.get('id'))
                    wb.save(
                        f'{courseItem.get("name")}/{courseItem.get("name")}-{li_nian_zhen_ti.get("paperName")}.xlsx')
            else:
                print("不支持的试卷类型:%s" % paper_type.get('paperTypeName'))
                # Abort with a failing status; SystemExit is raised directly because the
                # exit() helper is only installed by the site module and reported status 0.
                raise SystemExit(1)
评论 (0)