all+temp

Published 2023-09-08 18:11:00  Author: $KAMISAMALZ

All

import requests
import re
from bs4 import BeautifulSoup
import csv

# Send a GET request to fetch the page source
#url = 'https://www.blackview.hk/'

# Prompt the user for a URL and validate its format
while True:
    url = input("Enter a URL: ")
    pattern = r'^https?://[\w\-]+(\.[\w\-]+)+[/#?]?.*$'  # simple URL-validation regex
    if re.match(pattern, url):
        print("Fetching data......")
        break
    else:
        print("Please enter a valid URL!")
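# Note: the regex above is only a loose sanity check. A minimal alternative
# sketch using the standard library (urllib.parse is an addition, not part of
# the original script):
#
#     from urllib.parse import urlparse
#     parts = urlparse(url)
#     is_valid = parts.scheme in ('http', 'https') and bool(parts.netloc)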

response = requests.get(url)
html = response.text

# Use a regular expression to extract all matching links
pattern = r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"'
links = re.findall(pattern, html)
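# Since BeautifulSoup is already imported, the same IDs could be pulled from
# parsed tags instead of raw HTML (a sketch, assuming product hrefs look
# exactly like /products/<digits>; requires Python 3.8+ for the walrus):
#
#     soup = BeautifulSoup(html, 'html.parser')
#     links = [m.group(1) for a in soup.find_all('a', href=True)
#              if (m := re.fullmatch(r'/products/(\d+)', a['href']))]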

# Deduplicate the links
unique_links = list(set(links))

# Write the links to a file
file_path = 'F:\\price.txt'
with open(file_path, 'w', encoding='utf-8') as file:
    for link in unique_links:
        # rstrip('/') avoids a missing or doubled slash, whether or not the
        # user-entered URL ends with '/'
        file.write(f"{url.rstrip('/')}/products/{link}\n")

#print('Links saved to', file_path)
#print('3...')
# Read the list of links back from the file
url_list = []
file_path = 'F:\\price.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip empty lines
            url_list.append(line)

# Truncate the file so it can be reused for output
data = ""
with open(file_path, "w") as file:
    file.write(data)
#print("price.txt emptied successfully")
#print("Please wait......!")

### At this point the category-page links have been collected into url_list


# Loop over each category page, scrape product models and prices, and write them to a file
for url in url_list:
    # Send a GET request to fetch the page content
    response = requests.get(url)
    html = response.text

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find the tag with class "goods-list"
    goods_list = soup.find(class_='goods-list')
    if goods_list is None:  # skip pages without a product list
        continue

#--------Collect the per-category product links here--------#
# Get all <a> hrefs under the tag and deduplicate them (href=True skips anchors without an href)
    links = list(set(a['href'] for a in goods_list.find_all('a', href=True) if '/products/item' in a['href']))

# Count the links
    num_links = len(links)

# Write the link count and the links to a file
    with open('F:\\output.txt', 'a', encoding='utf-8') as file:
        #file.write("Link count: " + str(num_links) + "\n")
        file.write('\n'.join(links))
        file.write('\n')  # trailing newline keeps each batch separate
    #print("Links written to file")
#----------------------#

    # Get all the text under the tag
    content = goods_list.get_text()

    # Drop lines that contain no digits
    lines = content.split('\n')
    filtered_lines = [line for line in lines if any(char.isnumeric() for char in line)]

    # Write the content to the file
    with open('F:\\price.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join(filtered_lines))
        file.write('\n\n')  # blank line separates content from different links
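    # A polite-crawling sketch (an addition, not in the original script): a
    # short pause between requests avoids hammering the server.
    #
    #     import time
    #     time.sleep(1)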

#print("Product categories and prices written to price.txt")
print("Links written to file")

output_data = []  # data to be exported

with open("F:\\price.txt", "r", encoding="utf-8") as file:
    current_group = []  # rows collected for the current group

    for line in file:
        if line.strip():  # non-empty line: add it to the current group
            current_group.append(line.strip())
        else:
            output_data.append(current_group)  # blank line: close the group
            current_group = []  # start a new group

    if current_group:  # flush the final group
        output_data.append(current_group)

# Export to a CSV file, one column per group
with open("F:\\price.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    max_size = max(len(group) for group in output_data)  # longest group length = number of rows
    for i in range(max_size):
        row = []
        for group in output_data:
            if i < len(group):
                row.append(group[i])
            else:
                row.append("")  # pad missing cells with an empty string
        writer.writerow(row)
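# The nested loop above is a manual transpose with padding. An equivalent
# sketch using the standard library (an alternative, not the original code):
#
#     from itertools import zip_longest
#     for row in zip_longest(*output_data, fillvalue=""):
#         writer.writerow(row)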

print("Product models and prices exported to price.csv!")

url_1 = "https://www.blackview.hk"

with open("F:\\output.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

new_lines = [url_1 + line.strip() for line in lines]

with open("F:\\output.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(new_lines))

print("Base URL prepended to every line!")
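# A sketch of the same prefixing via urllib.parse.urljoin, which also copes
# with hrefs that are already absolute (an assumption about possible input,
# not something the original lines handle):
#
#     from urllib.parse import urljoin
#     new_lines = [urljoin(url_1, line.strip()) for line in lines]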


#----------Fetch the detailed information for every product
# Read the full list of product links from the file
url_list_all = []
file_path = 'F:\\output.txt'
file_path_temp = 'F:\\temp.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip empty lines
            url_list_all.append(line)

# Truncate the file for reuse as output
data = ""
cnt = 1
with open(file_path, "w") as file:
    file.write(data)
    
for url in url_list_all:
    # Send a GET request to fetch the page content
    response = requests.get(url)
    html = response.text

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find tags whose class is "left"/"right" or "li-tit"/"li-msg"
    tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])

    # Extract each tag's text, stripping surrounding whitespace and newlines
    content = [tag.get_text(strip=True) for tag in tags]
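    # When a page lays its specs out as label/value pairs, `content` comes back
    # interleaved. A dict-building sketch under that assumption (hypothetical;
    # the original keeps the flat list):
    #
    #     specs = dict(zip(content[::2], content[1::2]))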

    # Write the content to the temp file
    with open(file_path_temp, 'w', encoding='utf-8') as file:
        file.write('\n'.join(content))

    # Read the file back and find the index of the first line containing "Model"
    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    model_line_index = -1
    for i, line in enumerate(lines):
        if 'Model' in line:
            model_line_index = i
            break

    # If a "Model" line was found, drop everything before it (the line itself is kept)
    if model_line_index >= 0:
        lines = lines[model_line_index:]

    # Write the trimmed content back to the temp file
    with open(file_path_temp, 'w', encoding='utf-8') as file:
        file.writelines(lines)

    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    with open(file_path, 'a', encoding='utf-8') as file:
        file.writelines(lines)
        file.write('\n\n')  # blank line between products
    print(f"Product {cnt} written to output.txt")
    # Increment the counter
    cnt += 1
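The temp.txt round trips in the loop above exist only to trim everything before the "Model" line. A minimal in-memory sketch of the same step (an alternative, assuming content is the list already built from the tags):

    trimmed = content
    for i, text in enumerate(content):
        if 'Model' in text:
            trimmed = content[i:]
            break
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write('\n'.join(trimmed) + '\n\n')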

temp

import requests
import re
from bs4 import BeautifulSoup
import csv

# Read the full list of product links from the file
url_list_all = []
file_path = 'F:\\output.txt'
file_path_temp = 'F:\\temp.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip empty lines
            url_list_all.append(line)

# Truncate the file for reuse as output
data = ""
cnt = 1
with open(file_path, "w") as file:
    file.write(data)
    
for url in url_list_all:
    # Send a GET request to fetch the page content
    response = requests.get(url)
    html = response.text

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find tags whose class is "left"/"right" or "li-tit"/"li-msg"
    tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])

    # Extract each tag's text, stripping surrounding whitespace and newlines
    content = [tag.get_text(strip=True) for tag in tags]

    # Write the content to the temp file
    with open(file_path_temp, 'w', encoding='utf-8') as file:
        file.write('\n'.join(content))

    # Read the file back and find the index of the first line containing "Model"
    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    model_line_index = -1
    for i, line in enumerate(lines):
        if 'Model' in line:
            model_line_index = i
            break

    # If a "Model" line was found, drop everything before it (the line itself is kept)
    if model_line_index >= 0:
        lines = lines[model_line_index:]

    # Write the trimmed content back to the temp file
    with open(file_path_temp, 'w', encoding='utf-8') as file:
        file.writelines(lines)

    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    with open(file_path, 'a', encoding='utf-8') as file:
        file.writelines(lines)
        file.write('\n\n')  # blank line between products
    print(f"Product {cnt} written to output.txt")
    # Increment the counter
    cnt += 1
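A sketch reusing one HTTP connection across the loop with requests.Session (an optional tweak, not part of the original scripts; the timeout is likewise an assumption, added so a stalled request cannot hang the run):

    import requests

    session = requests.Session()  # reuses the underlying TCP connection
    for url in url_list_all:
        response = session.get(url, timeout=10)
        html = response.text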