# ===== Script 1: "All" — full scraping pipeline (category links → prices → product specs) =====
"""Scrape Blackview product categories, prices, and per-product spec tables.

Pipeline:
  1. Prompt for a site URL and collect the /products/<id> category links.
  2. For each category page, record its product-item links (output.txt)
     and its model/price text lines (price.txt).
  3. Regroup price.txt and export it, transposed, to price.csv.
  4. Prefix every product link with the site host, then fetch each product
     page and append its spec table (from "Model" onwards) to output.txt.
"""
import csv
import re

import requests
from bs4 import BeautifulSoup

# All intermediate/output files live on the F: drive, matching the
# original script's layout.
PRICE_TXT = 'F:\\price.txt'
OUTPUT_TXT = 'F:\\output.txt'
TEMP_TXT = 'F:\\temp.txt'
PRICE_CSV = 'F:\\price.csv'
BASE_HOST = 'https://www.blackview.hk'
REQUEST_TIMEOUT = 30  # seconds; without it a dead server hangs the script


def _prompt_for_url():
    """Keep prompting until the input looks like an http(s) URL."""
    pattern = r'^https?://[\w\-]+(\.[\w\-]+)+[/#?]?.*$'  # rough validation only
    while True:
        url = input("请输入网址: ")
        if re.match(pattern, url):
            print("正在获取数据中......")
            return url
        print("请输入有效的网址!")


def _fetch_html(url):
    """GET *url* and return the response body as text."""
    return requests.get(url, timeout=REQUEST_TIMEOUT).text


def _collect_category_urls(base_url):
    """Return the absolute, de-duplicated /products/<id> URLs on *base_url*."""
    html = _fetch_html(base_url)
    ids = set(re.findall(r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"', html))
    # BUG FIX: the original concatenated f"{url}products/{id}", which yields
    # a broken URL whenever the user omits the trailing slash.
    root = base_url if base_url.endswith('/') else base_url + '/'
    return [f"{root}products/{pid}" for pid in ids]


def _scrape_category(url):
    """Return (item_links, price_lines) scraped from one category page.

    Pages without a "goods-list" section yield two empty lists instead of
    crashing (the original died with AttributeError on those).
    """
    soup = BeautifulSoup(_fetch_html(url), 'html.parser')
    goods_list = soup.find(class_='goods-list')
    if goods_list is None:
        return [], []
    links = list(set(
        a['href'] for a in goods_list.find_all('a')
        if '/products/item' in a['href']
    ))
    # Keep only the text lines containing at least one digit — these carry
    # the model numbers and prices.
    text_lines = goods_list.get_text().split('\n')
    price_lines = [ln for ln in text_lines
                   if any(ch.isnumeric() for ch in ln)]
    return links, price_lines


def _export_price_csv():
    """Split price.txt into blank-line-separated groups and write them to
    price.csv transposed (one column per group)."""
    groups, current = [], []
    # BUG FIX: price.txt was written as UTF-8 but re-read here with the
    # locale encoding, garbling non-ASCII text on Chinese Windows.
    with open(PRICE_TXT, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                current.append(line.strip())
            else:
                groups.append(current)  # a blank line closes the group
                current = []
    if current:  # file may not end with a blank line
        groups.append(current)
    # utf-8-sig so spreadsheet apps detect the encoding of Chinese text.
    with open(PRICE_CSV, 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        # BUG FIX: max() raised ValueError when there were no groups at all.
        depth = max((len(g) for g in groups), default=0)
        for i in range(depth):
            writer.writerow([g[i] if i < len(g) else "" for g in groups])


def _prefix_host(path):
    """Rewrite *path* in place, prepending the site host to every link.

    BUG FIX: the original opened the relative path "output.txt" here while
    every other step used F:\\output.txt, so this step edited the wrong
    file (or raised FileNotFoundError) and the later product fetches were
    handed relative URLs.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = [ln.strip() for ln in f if ln.strip()]
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(BASE_HOST + ln for ln in lines))


def _scrape_product(url):
    """Fetch one product page and return its spec lines from "Model" on.

    The trimmed lines are also left in temp.txt, matching the original
    script's observable side effect (without its double write/read pass).
    """
    soup = BeautifulSoup(_fetch_html(url), 'html.parser')
    tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])
    lines = [tag.get_text(strip=True) for tag in tags]
    # Drop everything before the first line containing "Model"; keep all
    # lines when no such marker exists.
    for i, line in enumerate(lines):
        if 'Model' in line:
            lines = lines[i:]
            break
    with open(TEMP_TXT, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return lines


def main():
    base_url = _prompt_for_url()
    category_urls = _collect_category_urls(base_url)

    # Start with an empty price.txt; the category loop appends into it.
    with open(PRICE_TXT, 'w', encoding='utf-8'):
        pass

    for url in category_urls:
        links, price_lines = _scrape_category(url)
        with open(OUTPUT_TXT, 'a', encoding='utf-8') as f:
            f.write('\n'.join(links))
            f.write('\n')
        with open(PRICE_TXT, 'a', encoding='utf-8') as f:
            f.write('\n'.join(price_lines))
            f.write('\n\n')  # blank line separates the per-category groups
    print("链接已写入文件")

    _export_price_csv()
    print("导出产品型号与价格.csv文件成功!")

    _prefix_host(OUTPUT_TXT)
    print("URL地址已成功添加到每一行之前!")

    # Re-read the now-absolute product links, empty output.txt, then
    # append every product's spec table back into it.
    with open(OUTPUT_TXT, 'r', encoding='utf-8') as f:
        product_urls = [ln.strip() for ln in f if ln.strip()]
    with open(OUTPUT_TXT, 'w', encoding='utf-8'):
        pass
    for cnt, url in enumerate(product_urls, start=1):
        spec_lines = _scrape_product(url)
        with open(OUTPUT_TXT, 'a', encoding='utf-8') as f:
            f.write('\n'.join(spec_lines))
            f.write('\n\n')
        print(f"产品{cnt}已写入output.txt文件")


if __name__ == "__main__":
    main()
# ===== Script 2: "temp" — standalone product-detail scraper (duplicate of the final stage above) =====
"""Standalone pass: fetch every product page listed in output.txt and
replace the file's contents with each product's spec table.

Reads F:\\output.txt (one absolute product URL per line), empties it, then
for every URL appends the page's spec lines — from the first line
containing "Model" onwards — back into it. Each product's trimmed lines
are also written to F:\\temp.txt, as in the original script.
"""
import csv  # kept for parity with the original header (unused here)
import re   # kept for parity with the original header (unused here)

import requests
from bs4 import BeautifulSoup

OUTPUT_TXT = 'F:\\output.txt'
TEMP_TXT = 'F:\\temp.txt'
REQUEST_TIMEOUT = 30  # seconds; the original request could hang forever


def _trim_to_model(lines):
    """Return *lines* from the first line containing 'Model' onwards.

    The original computed this with index juggling
    (``i = i - 1 ... lines[model_line_index + 1:]``) that reduces to a
    plain slice; lists without a 'Model' line come back unchanged.
    """
    for i, line in enumerate(lines):
        if 'Model' in line:
            return lines[i:]
    return lines


def _scrape_product(url):
    """Fetch *url* and return its trimmed spec lines (also left in temp.txt)."""
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(html, 'html.parser')
    # The spec table cells carry these four CSS classes on the site.
    tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])
    lines = _trim_to_model([tag.get_text(strip=True) for tag in tags])
    # Single write of the final content replaces the original's
    # write-full / read / trim / rewrite round trip; end state is identical.
    with open(TEMP_TXT, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return lines


def main():
    # Read the product URLs (skipping blank lines), then truncate the file
    # so the loop below can append each product's details into it.
    with open(OUTPUT_TXT, 'r', encoding='utf-8') as f:
        product_urls = [ln.strip() for ln in f if ln.strip()]
    with open(OUTPUT_TXT, 'w', encoding='utf-8'):
        pass
    for cnt, url in enumerate(product_urls, start=1):
        spec_lines = _scrape_product(url)
        with open(OUTPUT_TXT, 'a', encoding='utf-8') as f:
            f.write('\n'.join(spec_lines))
            f.write('\n\n')  # blank line separates the products
        print(f"产品{cnt}已写入output.txt文件")


if __name__ == "__main__":
    main()