Python3 从腾讯共享文档下载 Excel

发布时间: 2023-07-03 16:30:54  作者: 小尾巴想看雪

脚本内容如下:

 

点击查看代码
# coding: UTF-8
import json
import os
import re
import sys
import time
from datetime import datetime
from time import sleep
import click
import hashlib
import pandas as pd
import jsonpath as jp
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from autoApiDeamo import dealwith_data


class TengXunDocument():
    """Small client for one Tencent Docs (docs.qq.com) online spreadsheet.

    Wraps the HTTP calls used to export/download the sheet as an xlsx file,
    move the online document to the recycle bin, and start an import of a
    local xlsx file.

    NOTE(review): every request is sent with ``verify=False`` (TLS certificate
    verification disabled), exactly as the original code did. That exposes the
    session cookie to man-in-the-middle attacks; enable verification if the
    environment allows it.
    """

    # Upper bound, in seconds, for any single HTTP request. Without it a hung
    # connection would block forever and defeat the export deadline below.
    REQUEST_TIMEOUT = 30
    # Total time, in seconds, to wait for the server to prepare the export.
    EXPORT_TIMEOUT = 30
    # Pause, in seconds, between two export-progress polls.
    POLL_INTERVAL = 1

    def __init__(self, document_url, local_pad_id, cookie_value):
        """
        :param document_url: URL of the online spreadsheet
        :param local_pad_id: per-document id of the form ``<domain_id>$<pad_id>``;
            it has to be obtained manually for each document
        :param cookie_value: raw ``Cookie`` header value of a logged-in session
        """
        # URL of the online spreadsheet
        self.document_url = document_url
        # Every Tencent document has its own value; it must be looked up manually
        self.localPadId = local_pad_id
        self.headers = {
            'accept': '*/*',  # 'application/json',
            'Cookie': cookie_value,
            # NOTE(review): the referer is hard-coded to one specific document
            # and is not derived from document_url; confirm this is intended.
            'referer': 'https://docs.qq.com/sheet/DWVl3elpDT2tocHpj?tab=BB08J3',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'

        }

    @staticmethod
    def _extract_now_user_index(page_text):
        """Return the ``nowUserIndex`` value embedded in *page_text*, or None."""
        # Stop at the closing quote: a greedy \S+ would swallow the following
        # JSON fields when the page is serialized without whitespace.
        match = re.search(r'"nowUserIndex":\s*"([^"\s]+)"', page_text)
        return match.group(1) if match else None

    @staticmethod
    def _extract_xsrf_token(cookie):
        """Return the value of the ``TOK`` cookie in *cookie*, or None."""
        # Anchor on a cookie boundary so names merely ending in "TOK" do not
        # match, and do not require a trailing ';' (TOK may be the last cookie).
        match = re.search(r'(?:^|;\s*)TOK=([^;\s]+)', cookie)
        return match.group(1) if match else None

    def get_now_user_index(self):
        """
        Fetch the document page and read the current user's index from it;
        the value is needed to build the export/delete/import URLs.

        :return: the ``nowUserIndex`` string (e.g.
            '4883730fe8b94fbdb94da26a9a63b688'), or the message
            '未获取到now_user_index的值' when it is not found in the page
        """
        response_body = requests.get(url=self.document_url, headers=self.headers,
                                     verify=False, timeout=self.REQUEST_TIMEOUT)
        parser = BeautifulSoup(response_body.content, 'html.parser')
        now_user_index = self._extract_now_user_index(str(parser))
        if now_user_index:
            return now_user_index
        # Kept for backward compatibility: callers receive this message
        # instead of an exception and must check for it themselves.
        return '未获取到now_user_index的值'

    def export_excel_task(self, export_excel_url):
        """
        Create the server-side "export to excel" task.

        :param export_excel_url: export endpoint, already carrying the user index
        :return: the ``operationId`` used to poll the export progress
        """
        body = {
            'docId': self.localPadId,
            'version': '2',
            'exportSource': 'client',
        }
        res = requests.post(url=export_excel_url, headers=self.headers, data=body,
                            verify=False, timeout=self.REQUEST_TIMEOUT)
        return res.json()['operationId']

    def download_excel(self, check_progress_url, file_name):
        """
        Wait for the export task to finish and save the resulting xlsx file.
        An existing file with the same name is overwritten.

        :param check_progress_url: progress endpoint for the export operation
        :param file_name: local path the xlsx file is written to
        :return: True when the file was written, False otherwise
        """
        # Poll until the server reports 100% and hands out the file URL.
        deadline = time.monotonic() + self.EXPORT_TIMEOUT
        file_url = ''
        while True:
            res = requests.get(url=check_progress_url, headers=self.headers,
                               verify=False, timeout=self.REQUEST_TIMEOUT)
            payload = res.json()
            if payload['progress'] == 100:
                file_url = payload['file_url']
                break
            if time.monotonic() >= deadline:
                print("数据准备超时,请排查")
                break
            # Do not hammer the server with back-to-back requests.
            time.sleep(self.POLL_INTERVAL)

        if not file_url:
            print("下载文件地址获取失败, 下载excel文件不成功")
            return False

        # Use a copy so the binary content-type does not leak into
        # self.headers and corrupt later form/JSON requests.
        download_headers = dict(self.headers)
        download_headers['content-type'] = 'application/octet-stream'
        res = requests.get(url=file_url, headers=download_headers,
                           verify=False, timeout=self.REQUEST_TIMEOUT)
        if res.status_code != 200:
            # Never save an error page under an .xlsx name.
            print('下载excel文件失败, HTTP状态码: ' + str(res.status_code))
            return False
        with open(file_name, 'wb') as f:
            f.write(res.content)
        print('下载成功,文件名: ' + file_name)
        return True

    def remove_excel(self, remove_excel_url):
        """
        Move the online document to the recycle bin.

        The request needs: domain_id, pad_id, list_type, folder_id and xsrf.
        domain_id/pad_id come from ``localPadId``; xsrf is the ``TOK`` cookie.

        :param remove_excel_url: delete endpoint, already carrying the user index
        :raises ValueError: when the session cookie contains no ``TOK`` entry
        """
        pad_parts = self.localPadId.split('$')
        domain_id = pad_parts[0]
        pad_id = pad_parts[1]
        # Read the cookie from the instance, not from a module-level global,
        # so the class also works when imported from another module.
        cookie = self.headers['Cookie']
        xsrf = self._extract_xsrf_token(cookie)
        if xsrf is None:
            raise ValueError('TOK cookie not found in the session cookie')
        body = {'domain_id': domain_id, 'pad_id': pad_id, 'list_type': 5, 'folder_id': '/', 'xsrf': xsrf}
        headers_only = {'cookie': cookie}
        res1 = requests.post(url=remove_excel_url, headers=headers_only, data=body,
                             verify=False, timeout=self.REQUEST_TIMEOUT)

        if res1.status_code == 200:
            print(res1.json(), '\n', '将在线excel文档移动到回收站完成')
        else:
            print('将在线excel文档移动到回收站失败')

    def update_excel(self, new_file, old_file):
        """
        Placeholder for merging the content of old_file into new_file.

        Currently it only opens both workbooks (which verifies that they are
        readable) and closes them again; the merge rules are not implemented.

        :param new_file: xlsx exported from the online document
        :param old_file: xlsx holding the check results
        :return: True
        """
        new_wb = load_workbook(filename=new_file, read_only=True, data_only=True)
        try:
            # The original opened new_file a second time here by mistake.
            old_wb = load_workbook(filename=old_file, read_only=True, data_only=True)
            try:
                # TODO: copy rows from the first sheet of old_wb into new_wb.
                pass
            finally:
                # Read-only workbooks keep the file handle open until closed.
                old_wb.close()
        finally:
            new_wb.close()
        return True

    def import_secretKey(self, secretkey_url, filename):
        """
        Register a local xlsx file for import and obtain its upload URL.

        :param secretkey_url: secretKey endpoint, already carrying the user index
        :param filename: path of the excel file to import
        :return: the ``CosPutUrl`` from the response with its query string removed
        :raises SystemExit: (status 1) when the request fails or the response
            carries no ``CosPutUrl``
        """
        md5_obj = hashlib.md5()
        with open(filename, 'rb') as file:
            # Hash in chunks so large workbooks are not read into memory at once.
            for chunk in iter(lambda: file.read(65536), b''):
                md5_obj.update(chunk)
        file_hash = md5_obj.hexdigest()
        file_size = os.path.getsize(filename)
        # NOTE(review): FileName is sent exactly as passed in (possibly a full
        # local path) - confirm whether the service expects only the base name.
        body = {
            'Files': [{"FileName": filename, "FileMD5": file_hash, "FileSize": file_size}]
        }
        headers_only = {'cookie': self.headers['Cookie']}
        res_sk = requests.post(secretkey_url, headers=headers_only, json=body,
                               verify=False, timeout=self.REQUEST_TIMEOUT)
        if res_sk.status_code == 200:
            # jsonpath returns False (not an empty list) when nothing matches.
            put_urls = jp.jsonpath(res_sk.json(), '$..CosPutUrl')
            if put_urls:
                return put_urls[0].split('?')[0]
        print('导入文件失败')
        # A non-zero status tells the calling shell that the import failed.
        sys.exit(1)

    def import_excel(self, import_excel_url):
        """
        Query the import endpoint and print the response.

        :param import_excel_url: import/progress endpoint, already carrying the
            user index and operation id
        :return: True when the server answered with HTTP 200, False otherwise
        """
        headers_only = {'cookie': self.headers['Cookie']}
        res_imp = requests.get(import_excel_url, headers=headers_only,
                               verify=False, timeout=self.REQUEST_TIMEOUT)
        if res_imp.status_code != 200:
            # Check the status first: an error response may not be JSON at all.
            print('==requests.get({0}) is failed'.format(import_excel_url))
            return False
        print(res_imp.json())
        print(f'==请求成功{import_excel_url}')
        return True


if __name__ == '__main__':
    # 1. Download the online document as an xlsx file.
    # URL of the online spreadsheet
    document_url = 'https://docs.qq.com/sheet/DWVl3elpDT2tocHpj'
    # Every Tencent document has its own value; it must be looked up manually
    local_pad_id = '300000000$YROuGfjifCgh'
    # Cookie of a logged-in session, captured from the browser's requests.
    # NOTE(review): live session credentials should not be committed to source
    # control; set the TX_DOC_COOKIE environment variable to override the
    # built-in value without editing this file.
    cookie_value = os.environ.get('TX_DOC_COOKIE') or 'fingerprint=4a40b1c76236492ea6cef2bee8bbc95b33; optimal_cdn_domain=docs2.gtimg.com; RK=hUXA7JsMWq; ptcz=68f0264ee50556b1e9a3643b2b454fd7cb9c269ae49ba606f14196c435a13ab4; low_login_enable=1; wx_appid=wxd45c635d754dbf59; openid=oDzL40MvaXobdiWhfsa3GyAOWaeg; access_token=68_XZQXjCRnw_xS3HtDJ1lpJnqLjzSIDa6blM9jkOuFpXgJG2DbffQp4AziabPX8gt8lc9IYIsSwueyrQSayaUqoMRht5d_Ze3c6LvLTIM7mxw; refresh_token=68_iHLvLX0KltgwF9NmXGB56W6ZnQoH_l8cJpuDiOWKgAwea1OkN2u473vJxPOGJ8cgA91vaRq0Ck4uveDYW8Gwxf33khoxiihnD-6YyazU1Co; DOC_SID=eae9e089259c48019074fc7dbd82e739941b87b3c27b4db888dda6887a28c914; SID=eae9e089259c48019074fc7dbd82e739941b87b3c27b4db888dda6887a28c914; loginTime=1684976719137; traceid=9cb4fad56c; TOK=9cb4fad56c096902; hashkey=9cb4fad5'
    tx = TengXunDocument(document_url, local_pad_id, cookie_value)
    now_user_index = tx.get_now_user_index()
    # get_now_user_index reports failure by returning this message instead of
    # raising; never build request URLs from it.
    if not now_user_index or now_user_index == '未获取到now_user_index的值':
        sys.exit('未获取到now_user_index的值, 请检查cookie是否过期')
    # URL of the export task
    export_excel_url = f'https://docs.qq.com/v1/export/export_office?u={now_user_index}'
    # Operation id of the export task
    operation_id = tx.export_excel_task(export_excel_url)
    check_progress_url = f'https://docs.qq.com/v1/export/query_progress?u={now_user_index}&operationId={operation_id}'
    # current_datetime = datetime.strftime(datetime.now(), '%Y_%m_%d_%H_%M_%S')
    file_name = '测试表格001.xlsx'
    # Remember the state of any earlier download so that a stale file is not
    # mistaken for a fresh one.
    previous_mtime = os.stat(file_name).st_mtime_ns if os.path.isfile(file_name) else None
    tx.download_excel(check_progress_url, file_name)
    downloaded = os.path.isfile(file_name) and os.stat(file_name).st_mtime_ns != previous_mtime
    if not downloaded:
        # Deleting the online document without a local copy would lose data.
        sys.exit('下载excel文件失败, 为避免数据丢失不删除在线文档')

    # 2. Move the online document to the recycle bin.
    doc_delete_url = f'https://docs.qq.com/cgi-bin/online_docs/doc_delete?u={now_user_index}'
    tx.remove_excel(doc_delete_url)

    # 3. Edit the document (not implemented).


    # 4. Import the file again.
    secretkey_url = f'https://docs.qq.com/v1/import/secretKey?u={now_user_index}'
    print(secretkey_url)
    # NOTE(review): this is a different file from the one downloaded in step 1;
    # it must already exist in the current working directory.
    file_name = '测试表格.xlsx'
    CosPutUrl = tx.import_secretKey(secretkey_url, os.path.join(os.getcwd(), file_name))
    print(CosPutUrl)
    # import_operation_id = f'1686647785203_{file_hash}'
    # import_excel_url = f'https://docs.qq.com/v1/import/queryImportProgress?u={now_user_index}&operationId={import_operation_id}'
    # print(import_excel_url)
    # tx.import_excel(import_excel_url)
    # print(CosPutUrl)