# I can browse the Hugging Face website but cannot download files from it directly, so this script collects the download links from huggingface.co so that the files can then be downloaded via another route.

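# Prints the command lines needed to download every file of a given Hugging Face model.

# The output can be pasted into an AutoDL instance and executed there; download speed can reach about 13 MB/s.

# Training and inference can then be run on AutoDL, or the files can simply be copied out.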

import requests

from bs4 import BeautifulSoup

# Repository id on the Hugging Face Hub whose files should be listed.
# Other ids that have been used here: "Qwen/Qwen-7B-Chat-Int4", "Qwen/Qwen-7B-Chat"
model = "Qwen/Qwen-14B-Chat-Int4"

# Page the crawl starts from. Replace with the URL of the page to analyse,
# e.g. https://huggingface.co/gpt2/tree/main/onnx
url = f"https://huggingface.co/{model}/tree/main"

# href prefix used to recognise download links of this repository.
startString = f"/{model}/resolve"

# href prefix used to recognise sub-directory listing pages of this repository.
pathString = f"/{model}/tree/main/"

def get_last_part_before_slash(s):
    """Return everything in *s* that precedes its last ``'/'``.

    Args:
        s: The string to cut, typically a URL path.

    Returns:
        The part of *s* before the last slash (the slash itself is dropped),
        or *s* unchanged when it contains no slash at all.
    """
    # rpartition yields an empty separator when no '/' is present, which is
    # the signal to hand the original string back untouched.
    head, slash, _tail = s.rpartition('/')
    return head if slash else s

def getFileListInWebPage(url, startString, pathString, visited=None, timeout=30):
    """Print a ``wget`` command for every download link reachable from *url*.

    The page at *url* is fetched and the ``href`` of each ``<a>`` tag is
    inspected:

    * hrefs starting with *startString* are collected as download links;
    * hrefs starting with *pathString* are treated as further listing pages
      and crawled recursively. Their commands are printed before the
      commands of the page that links to them.

    Args:
        url: Absolute URL of the listing page to fetch.
        startString: href prefix that identifies a download link.
        pathString: href prefix that identifies a listing page to recurse into.
        visited: Set of page URLs that were already crawled. Callers normally
            leave this as ``None``; the same set is passed down the recursive
            calls so that no page is fetched twice.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        requests.RequestException: If a page cannot be fetched in time or the
            server answers with an HTTP error status.
    """
    if visited is None:
        visited = set()
    # A listing page may link back to itself or to a directory that has
    # already been crawled; without this guard the recursion would not end.
    if url in visited:
        return
    visited.add(url)

    # Fail loudly instead of hanging forever or silently parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    download_links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        if href.startswith(startString):
            download_links.append(href)
        elif href.startswith(pathString):
            getFileListInWebPage(
                'https://huggingface.co' + href,
                startString,
                pathString,
                visited,
                timeout,
            )

    for link in download_links:
        # Directory part of the link, appended to the download root below.
        # NOTE(review): when link does not contain pathString the split leaves
        # it unchanged, so the whole href directory is appended - confirm that
        # this target layout is the intended one.
        sFilePath = get_last_part_before_slash(link.split(pathString)[-1])
        print("wget -c \"https://huggingface.co"+link+"\" -P /root/autodl-tmp"+sFilePath)

# Script entry: crawl the configured start page and print the wget commands.
getFileListInWebPage(url=url, startString=startString, pathString=pathString)

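# Related keywords (residue from the web page, not part of the script):
#
# data, huggingface, huggingface path
#
# huggingface data
#
# usage notes, huggingface data, arrow
#
# huggingface labelled data, bart
#
# huggingface dataset, code, data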