蚂蚁智学题库爬虫并整理到Excel

接的一个小私活，目标是把题库爬取下来并整理到Excel里。
目标站点:https://www.mayizhixue.cn/
import requests
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl import load_workbook
import os

# Headers sent with every API request made in this file.
# NOTE(review): 'TOKEN' looks like a placeholder for a real Authorization
# value -- confirm before running.
common_headers = {
    'Authorization': 'TOKEN',
}

# Sequence number written to column 1 of the next top-level question row;
# write_row() advances it.
record_id = 1
# Workbook loaded from the template file; question rows go to its active sheet.
wb = load_workbook(filename='sample.xlsx')
ws = wb.active
# Rebound together with wb/ws wherever a fresh workbook is loaded; not read
# anywhere else in the visible code.
rows = ws.rows


def get_target_row_number():
    """Return the 1-based number of the first row whose first cell is empty.

    Scans the module-level worksheet ``ws`` from the top.  If every existing
    row has a value in its first cell, the number just past the last row is
    returned (1 for a sheet that yields no rows).
    """
    scanned = 0
    for scanned, cells in enumerate(ws.rows, start=1):
        if cells[0].value is None:
            return scanned
    # No gap found: the next free row follows the last one that was scanned.
    return scanned + 1


def write_row(row, kIndex=None):
    """Write one question into the first free row of the active worksheet.

    Columns written: 1 = sequence label, 2 = title, 3 = type, 6-10 = up to
    five options, 11 = answer, 12 = analysis.  Columns 4 and 5 (score and
    difficulty, per the original author's notes) are left untouched.

    Args:
        row: Dict with the optional keys 'title', 'type', 'options',
            'answer' and 'analysis'.
        kIndex: ``None`` for a top-level question, which is labelled with the
            global ``record_id`` and then advances it.  Otherwise the position
            of a sub-question; it is labelled ``"<parent id>.<kIndex>"`` and
            leaves ``record_id`` unchanged.
    """
    global record_id

    target_row = get_target_row_number()

    if kIndex is None:
        label = record_id
    else:
        # The parent row was written with kIndex=None, which already advanced
        # record_id, so the parent's own id is one less than the counter.
        # Using record_id directly would label the children with the id of
        # the *next* top-level question.
        label = str(max(record_id - 1, 1)) + '.' + str(kIndex)
    ws.cell(row=target_row, column=1).value = label

    ws.cell(row=target_row, column=2).value = row.get('title', '')
    ws.cell(row=target_row, column=3).value = row.get('type', '')

    # Options fill columns 6-10; zip() stops after the fifth option so the
    # answer column (11) is never overwritten.
    for column, option in zip(range(6, 11), row.get('options') or []):
        ws.cell(row=target_row, column=column).value = option

    ws.cell(row=target_row, column=11).value = row.get('answer', '')
    ws.cell(row=target_row, column=12).value = row.get('analysis', '')

    # Only top-level questions consume a sequence number.
    if kIndex is None:
        record_id += 1


def map_type_kv(key):
    """Translate a question-type code into its Chinese label.

    Each code is accepted either as a string or as an int.  An unknown code
    prints a message and terminates the program via ``exit()``.
    """
    labels_by_code = (
        (('1', 1), '单选题'),
        (('2', 2), '多选题'),
        (('3', 3), '不定项选择题'),
        (('4', 4), '判断题'),
        (('6', 6), '材料题'),
    )
    for codes, label in labels_by_code:
        if key in codes:
            return label

    print('不支持的类型:%s' % key)
    exit()
    # Only reached if exit() returns instead of raising; mirrors the default
    # label the function starts out with.
    return '单选题'


def get_test_question(sectionId, timeout=30):
    """Fetch the questions of one section and write them to the worksheet.

    Calls the ``querySubjectList`` endpoint and hands the ``data`` field of
    the JSON reply to ``handle_data_2_excel``.

    Args:
        sectionId: Value sent as the ``sectionId`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/querySubjectList',
        params={
            'sectionId': sectionId,
            'type': '2',
        },
        headers=common_headers,
        timeout=timeout,
    )
    payload = response.json()
    handle_data_2_excel(payload.get('data'))


def _split_options(raw):
    """Split a '|'-separated option string, dropping each 2-char prefix."""
    if not raw:
        # Some questions carry no option string at all; calling .split() on
        # None would raise AttributeError.
        return []
    # Each option looks like "A.text", so the text starts at index 2.
    return [option[2:] for option in raw.split('|')]


def _build_question_row(question, type_label):
    """Build the dict write_row() expects for one answerable question."""
    return {
        'title': question.get('issue'),
        'type': type_label,
        'options': _split_options(question.get('sOption')),
        'answer': question.get('answer'),
        'analysis': question.get('analysis'),
    }


def handle_data_2_excel(data):
    """Write every question found in an API ``data`` payload to the sheet.

    ``data`` is iterated as a list of dicts.  Each dict maps a question-type
    code (see ``map_type_kv``) to the list of questions of that type.
    Material questions are written as a stem row followed by one row per
    sub-question, numbered from 1.

    Args:
        data: The payload described above.  ``None`` or an empty list is
            reported and skipped instead of raising.
    """
    if not data:
        print('未获取到题目数据,已跳过')
        return

    for group in data:
        for key, questions in group.items():
            type_label = map_type_kv(key)

            for question in questions or []:
                if type_label != '材料题':
                    write_row(_build_question_row(question, type_label))
                    continue

                # Material question: the shared stem gets its own row.
                write_row({
                    'title': question.get('stem'),
                    'type': type_label,
                })
                # NOTE(review): the key is spelled 'childre' in the original
                # code; presumably it matches the API payload -- verify.
                children = question.get('childre') or []
                for child_index, child in enumerate(children, start=1):
                    subtype = map_type_kv(child.get('subType'))
                    write_row(_build_question_row(child, subtype), child_index)


def get_exam_question(paperId, timeout=30):
    """Fetch the questions of one exam paper and write them to the worksheet.

    Calls the ``queryoPaperSubjectList`` endpoint and hands the ``data``
    field of the JSON reply to ``handle_data_2_excel``.

    Args:
        paperId: Value sent as the ``paperId`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryoPaperSubjectList',
        params={
            'paperId': paperId,
            'mode': '2',
        },
        headers=common_headers,
        timeout=timeout,
    )
    payload = response.json()
    handle_data_2_excel(payload.get('data'))


def get_catalogue(courseName, courseId, timeout=30):
    """Crawl every chapter of a course into per-section workbooks.

    Fetches the chapter list from ``querySectionList``, creates a folder
    ``<courseName>/<sectionName>`` for each top-level chapter, then walks the
    chapter tree with ``access_next_level``.

    Args:
        courseName: Name of the course output folder; used as the path prefix.
        courseId: Value sent as the ``courseId`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/querySectionList',
        params={'courseId': courseId},
        headers=common_headers,
        timeout=timeout,
    )
    sections = response.json().get('data')
    for section in sections:
        print(f"当前章节ID:{section.get('id')},章节名称:{section.get('sectionName')}")
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(courseName + '/' + section.get('sectionName'), exist_ok=True)

        # Depth-first walk: descend until sections without children are reached.
        access_next_level(courseName + "/", section)


def access_next_level(path, item):
    """Depth-first walk of a section tree, saving one workbook per leaf.

    A node whose 'children' value is not None is treated as a branch: its
    name is appended to ``path`` and each child is visited recursively.
    Any other node is a leaf: a fresh template workbook is loaded into the
    module-level ``wb``/``ws``/``rows``, ``record_id`` restarts at 1, the
    section's questions are fetched, and the workbook is saved under ``path``.
    """
    global wb, ws, rows, record_id

    children = item.get('children')
    if children is not None:
        branch_path = path + item.get('sectionName') + '/'
        for child in children:
            access_next_level(branch_path, child)
        return

    section_name = item.get('sectionName')
    print(f"当前小节ID:{item.get('id')},小节名称:{section_name}")

    # Start a new workbook from the template and restart the numbering.
    record_id = 1
    wb = load_workbook(filename='sample.xlsx')
    ws = wb.active
    rows = ws.rows

    get_test_question(item.get('id'))

    # Ensure the target folder exists before saving.
    if not os.path.exists(path):
        os.makedirs(path)
    wb.save(f'{path}/{section_name}.xlsx')


def get_product_course_info(id, timeout=30):
    """Return the ``data`` field of the ``queryProductCourse`` reply.

    Sample item noted by the original author::

        courseName: <course display name>
        examId: "43d8625d21614cab9f6a2e323e0cd4db"
        id: "1686999432228376576"

    Args:
        id: Value sent as the ``id`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        Whatever the reply holds under ``data`` (``None`` if the key is
        missing).
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryProductCourse',
        params={'id': id},
        headers=common_headers,
        timeout=timeout,
    )
    return response.json().get('data')


def query_paper_type_list(id, timeout=30):
    """Return the ``data`` field of the ``queryPaperTypeList`` reply.

    Sample item noted by the original author::

        id: "1"
        paperTypeName: <paper type display name>
        icon: null
        version: null
        isSection: null
        hasSection: null

    Args:
        id: Course id, sent as the ``courseId`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        Whatever the reply holds under ``data`` (``None`` if the key is
        missing).
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryPaperTypeList',
        params={'courseId': id},
        headers=common_headers,
        timeout=timeout,
    )
    return response.json().get('data')


def get_li_nian_zhen_ti_list(id, paperTypeId, timeout=30):
    """Return the ``data`` field of the ``queryPaperList`` reply.

    Sample item noted by the original author::

        id: "1703299201187844096"
        paperName: <paper display name>
        onlineTime: "2023-09-17 00:00:00"
        referenNumber: 139
        tryBuy: 1
        hasMake: 3
        mode: null

    Args:
        id: Course id, sent as the ``courseId`` query parameter.
        paperTypeId: Value sent as the ``paperTypeId`` query parameter.
        timeout: Seconds ``requests`` waits for the server.  Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        Whatever the reply holds under ``data`` (``None`` if the key is
        missing).
    """
    response = requests.get(
        'https://wx.yiwenjy.cn/yunlian_pc/queryPaperList',
        params={
            'courseId': id,
            'paperTypeId': paperTypeId,
        },
        headers=common_headers,
        timeout=timeout,
    )
    return response.json().get('data')


# Courses to crawl: each entry gives the product id and the output folder name.
course = [
    {'id': 'f753f9934c60427fadfba664229a8487', 'name': '2024年军队文职人员招聘《公共科目》题库'}
]

for course_entry in course:
    course_dir = course_entry.get('name')

    # One output folder per course.
    if not os.path.exists(course_dir):
        os.makedirs(course_dir)

    for product_course in get_product_course_info(course_entry.get('id')):
        product_course_id = product_course.get('id')

        # Every paper type offered for this subject is handled in turn.
        for paper_type in query_paper_type_list(product_course_id):
            paper_type_name = paper_type.get('paperTypeName')
            print(f"当前科目:{product_course.get('courseName')},当前试卷类型:{paper_type_name}")

            if paper_type_name == '章节练习':
                # Chapter practice: walk the section tree, one workbook per leaf.
                get_catalogue(course_dir, product_course_id)
            # The tuple repeats one entry; membership is unaffected.
            elif paper_type_name in ('历年真题', '考前点题', '模拟试卷', '预测试卷', '考前点题'):
                for paper in get_li_nian_zhen_ti_list(product_course_id, paper_type.get('id')):
                    # Fresh template workbook and numbering for each paper.
                    record_id = 1
                    wb = load_workbook(filename='sample.xlsx')
                    ws = wb.active
                    rows = ws.rows
                    print(f"当前试卷ID:{paper.get('id')},试卷名称:{paper.get('paperName')}")
                    get_exam_question(paper.get('id'))
                    wb.save(f'{course_dir}/{course_dir}-{paper.get("paperName")}.xlsx')
            else:
                print("不支持的试卷类型:%s" % paper_type_name)
                exit()