一、选题背景介绍及选题意义

随着经济的发展和城市化进程的加快，房地产市场已成为我国经济增长的重要支撑。然而，房价的波动却成为了社会关注的焦点之一。因此，利用大数据分析方法对房价进行研究和预测已成为当前热门的研究方向。大数据分析技术可以从多维度、全方位地收集和分析相关数据，发现数据背后的规律，并进行预测和决策。在大数据时代，各个领域都在积极应用大数据分析技术，房地产领域也不例外。通过大数据分析，可以从各个角度深入探索影响房价的因素，如基础设施、经济发展、政策环境、人口流动、土地供给等因素，同时还可发现各种数据之间的关系和趋势，以便更好地预测未来的房价变化。因此，大数据分析已经成为了房地产市场的重要工具之一，对于投资者、房地产开发商、政府部门等有着重要的实践意义。

二、大数据分析方案

步骤1：数据预处理和清洗

导入数据集并检查数据的完整性和一致性。
处理缺失值、异常值和重复值，确保数据的准确性和可靠性。

步骤2：数据探索性分析

对数据集进行统计描述和可视化分析，绘制相关图表（矩阵图、箱线图、散点图等）。
探索各个特征之间的关联性，通过相关系数矩阵、热力图等方法分析特征之间的相关程度。

三、数据分析步骤

1.数据来源：安居客 (anjuke.com) 的“南京历史房价走势图 / 南京历史房价数据查询 / 南京近几年房价走势图”页面。

2.数据清洗

 1 import re
 2 import csv
 3 
 4 """
 5 1、读入数据.2、清理数据.3、写出数据.
 6 """
 7 filename = "data_file\\ershoufang-mini-utf8.csv"
 8 with open(filename, encoding="utf-8") as f:
 9 reader = csv.reader(f)
10 context = [line for line in reader]
11 
12 with open("data_file\\ershoufang-mini-utf8.txt", "w", encoding="utf-8", newline="") as f:
13 writer = csv.writer(f)
14 for line in context:
15 line = [x.strip() for x in line] # 去除每个数据项的空白符和换行符
16 if line[0] == "id":
17 writer.writerow(line)
18 continue
19 
20 # 将杂乱的记录的数据项对齐
21 if "别墅" in line:
22 line_copy = line[:]
23 line[8] = "null"
24 line[9] = line_copy[8]
25 line[10] = "null"
26 line[11] = line_copy[9]
27 line[12] = line_copy[10]
28 line[13] = line_copy[11]
29 line[14] = "null"
30 line[15] = "null"
31 line[16] = line_copy[13]
32 if "商业办公类" in line:
33 # 正则表达式匹配
34 result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
35 if result is None:
36 del line[17]
37 result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
38 if result is None:
39 del line[17]
40 result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
41 if result is None:
42 del line[17]
43 if "车库" in line:
44 line_copy = line[:]
45 line[5] = "null"
46 line[6] = line_copy[5]
47 line[7] = "null"
48 line[11] = line_copy[7]
49 
50 try:
51 # 将总价数据项统一整理为整数
52 float_num = float(line[3])
53 line[3] = str(int(float_num))
54 
55 # 去除单价数据项单位
56 line[4] = line[4].split("元")[0]
57 
58 # 去除建筑面积数据项的单位
59 if line[7] != "null" and line[7] != "暂无数据":
60 line[7] = line[7].split("㎡")[0]
61 
62 # 去除套内面积数据项的单位
63 if line[9] != "null" and line[9] != "暂无数据":
64 line[9] = line[9].split("㎡")[0]
65 
66 writer.writerow(line)
67 except Exception as e:
68 print("数据项转换失败!该记录未写入")
69 
3.数据分析

各区域二手房平均单价
 2 groups_unitprice_area = df["unitPriceValue"].groupby(df["areaName"])
 3 mean_unitprice = groups_unitprice_area.mean()
 4 mean_unitprice.index.name = ""
 5 
 6 fig = plt.figure(figsize=(12,7))
 7 ax = fig.add_subplot(111)
 8 ax.set_ylabel("单价(元/平米)",fontsize=14)
 9 ax.set_title("南京各区域二手房平均单价",fontsize=18)
10 mean_unitprice.plot(kind="bar",fontsize=12)
11 #plt.savefig('data_ana\\picture\\mean_price.jpg')
12 #plt.show()

各区域二手房单价箱线图
 2  2 
 3  3 box_unitprice_area = df["unitPriceValue"].groupby(df["areaName"])
 4  4 flag = True
 5  5 box_data = pd.DataFrame(list(range(21000)),columns=["start"])
 6  6 for name,group in box_unitprice_area:
 7  7 box_data[name] = group
 8  8 del box_data["start"]
 9  9 #mean_unitprice.index.name = ""
10 10 
11 11 fig = plt.figure(figsize=(12,7))
12 12 ax = fig.add_subplot(111)
13 13 ax.set_ylabel("总价(万元)",fontsize=14)
14 14 ax.set_title("南京各区域二手房单价箱线图",fontsize=18)
15 15 box_data.plot(kind="box",fontsize=12,sym='r+',grid=True,ax=ax,yticks=[20000,30000,40000,50000,100000]）

各区域二手房总价箱线图
 2 
 3 box_total_area = df["total"].groupby(df["areaName"])
 4 flag = True
 5 box_data = pd.DataFrame(list(range(21000)),columns=["start"])
 6 for name,group in box_total_area:
 7 box_data[name] = group
 8 del box_data["start"]
 9 #mean_unitprice.index.name = ""
10 
11 fig = plt.figure(figsize=(12,7))
12 ax = fig.add_subplot(111)
13 ax.set_ylabel("总价(万元)",fontsize=14)
14 ax.set_title("南京各区域二手房总价箱线图",fontsize=18)
15 box_data.plot(kind="box",fontsize=12,sym='r+',grid=True,ax=ax,yticks=[0,200,500,1000,2000,3000],ylim=[0,2100]）

各区域二手房平均建筑面积
 2 
 3 groups_area_jzmj = df["jzmj"].groupby(df["areaName"])
 4 mean_jzmj = groups_area_jzmj.mean()
 5 mean_jzmj.index.name = ""
 6 
 7 #数据可视化
 8 fig = plt.figure(figsize=(12,7))
 9 ax = fig.add_subplot(111)
10 ax.set_ylabel("建筑面积(㎡)",fontsize=14)
11 ax.set_title("南京各区域二手房平均建筑面积",fontsize=18)
12 mean_jzmj.plot(kind="bar",fontsize=12)

南京二手房单价与建筑面积散点图

# Scatter plot: unit price against floor area.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("南京二手房单价与建筑面积散点图", fontsize=18)
df.plot(
    x="jzmj",
    y="unitPriceValue",
    kind="scatter",
    grid=True,
    fontsize=12,
    ax=ax,
    alpha=0.4,
    xticks=[0, 50, 100, 150, 200, 250, 300, 400, 500, 600, 700],
    xlim=[0, 800],
)
# Axis labels are set after plotting so they replace the ones pandas assigns.
ax.set_xlabel("建筑面积(㎡)", fontsize=14)
ax.set_ylabel("单价(元/平米)", fontsize=14)
plt.show()

 1 """南京各区域二手房总价箱线图"""
 2 #数据分组、数据运算和聚合
 3 box_total_area = df["total"].groupby(df["areaName"])
 4 flag = True
 5 box_data = pd.DataFrame(list(range(21000)),columns=["start"])
 6 for name,group in box_total_area:
 7     box_data[name] = group
 8 del box_data["start"]
 9 #mean_unitprice.index.name = ""
10 
11 fig = plt.figure(figsize=(12,7))
12 ax = fig.add_subplot(111)
13 ax.set_ylabel("总价(万元)",fontsize=14)
14 ax.set_title("南京各区域二手房总价箱线图",fontsize=18)
15 box_data.plot(kind="box",fontsize=12,sym='r+',grid=True,ax=ax,yticks=[0,200,500,1000,2000,3000],ylim=[0,2100])
16 plt.show()

 1 groups_area = df["id"].groupby(df["areaName"])
 2 count_area = groups_area.count()
 3 count_area.index.name = ""
 4 
 5 fig = plt.figure(figsize=(12,7))
 6 ax = fig.add_subplot(111)
 7 ax.set_ylabel("房源数量(套)",fontsize=14)
 8 ax.set_title("南京各区域二手房房源数量",fontsize=18)
 9 count_area.sort_values().plot(kind="line",fontsize=12,grid=True,marker="o")
10 plt.show()

 1 import seaborn as sns
 2 
 3 # 计算南京各区域二手房的平均单价
 4 mean_unitprice_area = df.groupby('areaName')['unitPriceValue'].mean().reset_index()
 5 
 6 # 使用pivot_table函数转换数据格式
 7 heatmap_data = mean_unitprice_area.pivot_table(index='areaName', columns='areaName', values='unitPriceValue', aggfunc='mean')
 8 
 9 # 绘制热力图
10 plt.figure(figsize=(10, 8))
11 sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt=".0f", linewidths=.5)
12 plt.title('南京各区域二手房平均单价热力图')
13 plt.xlabel('区域')
14 plt.ylabel('区域')
15 plt.xticks(rotation=45)
16 plt.yticks(rotation=0)
17 plt.show()

总代码

import csv
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

  8 """
  9 1、读入数据.2、清理数据.3、写出数据.
 10 """
 11 filename = "data_file\\ershoufang-mini-utf8.csv"
 12 with open(filename, encoding="utf-8") as f:
 13     reader = csv.reader(f)
 14     context = [line for line in reader]
 15 
 16 with open("data_file\\ershoufang-mini-utf8.txt", "w", encoding="utf-8", newline="") as f:
 17     writer = csv.writer(f)
 18     for line in context:
 19         line = [x.strip() for x in line]  # 去除每个数据项的空白符和换行符
 20         if line[0] == "id":
 21             writer.writerow(line)
 22             continue
 23 
 24         # 将杂乱的记录的数据项对齐
 25         if "别墅" in line:
 26             line_copy = line[:]
 27             line[8] = "null"
 28             line[9] = line_copy[8]
 29             line[10] = "null"
 30             line[11] = line_copy[9]
 31             line[12] = line_copy[10]
 32             line[13] = line_copy[11]
 33             line[14] = "null"
 34             line[15] = "null"
 35             line[16] = line_copy[13]
 36         if "商业办公类" in line:
 37             # 正则表达式匹配
 38             result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
 39             if result is None:
 40                 del line[17]
 41             result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
 42             if result is None:
 43                 del line[17]
 44             result = re.match(r"\d{4}-\d{1,2}-\d{1,2}", line[17])
 45             if result is None:
 46                 del line[17]
 47         if "车库" in line:
 48             line_copy = line[:]
 49             line[5] = "null"
 50             line[6] = line_copy[5]
 51             line[7] = "null"
 52             line[11] = line_copy[7]
 53 
 54         try:
 55             # 将总价数据项统一整理为整数
 56             float_num = float(line[3])
 57             line[3] = str(int(float_num))
 58 
 59             # 去除单价数据项单位
 60             line[4] = line[4].split("元")[0]
 61 
 62             # 去除建筑面积数据项的单位
 63             if line[7] != "null" and line[7] != "暂无数据":
 64                 line[7] = line[7].split("㎡")[0]
 65 
 66             # 去除套内面积数据项的单位
 67             if line[9] != "null" and line[9] != "暂无数据":
 68                 line[9] = line[9].split("㎡")[0]
 69 
 70             writer.writerow(line)
 71         except Exception as e:
 72             print("数据项转换失败!该记录未写入")
 73 
 74 
 75 
 76 
 77 #用来正常显示中文标签
 78 plt.rcParams['font.sans-serif'] = ['SimHei']
 79 #用来正常显示负号
 80 plt.rcParams['axes.unicode_minus'] = False
 81 
 82 """1、数据加载"""
 83 #定义加载数据的文件名
 84 #filename = "data_file\\ershoufang-mini-utf8.csv"
 85 filename = "data_file\\ershoufang-clean-utf8-v1.1.csv"
 86 #自定义数据的行列索引（行索引使用pd默认的，列索引使用自定义的）
 87 names = [
 88         "id","communityName","areaName","total","unitPriceValue",
 89         "fwhx","szlc","jzmj","hxjg","tnmj",
 90         "jzlx","fwcx","jzjg","zxqk","thbl",
 91         "pbdt","cqnx","gpsj","jyqs","scjy",
 92         "fwyt","fwnx","cqss","dyxx","fbbj",
 93         ]
 94 #自定义需要处理的缺失值标记列表
 95 miss_value = ["null","暂无数据"]
 96 #数据类型会自动转换
 97 #使用自定义的列名，跳过文件中的头行，处理缺失值列表标记的缺失值
 98 df = pd.read_csv(filename,skiprows=[0],names=names,na_values=miss_value)
 99 #print(df.info())
100 
101 """2、数据运算"""
102 """3、数据可视化呈现"""
103 
104 
# Average unit price per district.
# Grouping and aggregation: mean of "unitPriceValue" per "areaName".
groups_unitprice_area = df["unitPriceValue"].groupby(df["areaName"])
mean_unitprice = groups_unitprice_area.mean()
# Clear the index name (presumably so it is not drawn as an axis label).
mean_unitprice.index.name = ""

fig, ax = plt.subplots(figsize=(12, 7))
ax.set_ylabel("单价(元/平米)", fontsize=14)
ax.set_title("南京各区域二手房平均单价", fontsize=18)
# Draw on the axes configured above rather than on the implicit current axes.
mean_unitprice.plot(kind="bar", fontsize=12, ax=ax)
# Optional output steps, left disabled as in the original:
# plt.savefig('data_ana\\picture\\mean_price.jpg')
# plt.show()

# Box plot: unit price per district.
# Grouping and aggregation
box_unitprice_area = df["unitPriceValue"].groupby(df["areaName"])
# One column per district. Building the frame from the groups themselves
# keeps every row, instead of aligning onto a fixed 21000-row scaffold.
box_data = pd.DataFrame({name: group for name, group in box_unitprice_area})

fig, ax = plt.subplots(figsize=(12, 7))
# The chart shows unit prices, so label the axis with the unit-price unit.
ax.set_ylabel("单价(元/平米)", fontsize=14)
ax.set_title("南京各区域二手房单价箱线图", fontsize=18)
box_data.plot(
    kind="box",
    fontsize=12,
    sym='r+',
    grid=True,
    ax=ax,
    yticks=[20000, 30000, 40000, 50000, 100000],
)

# Box plot: total price per district.
# Grouping and aggregation
box_total_area = df["total"].groupby(df["areaName"])
# One column per district. Building the frame from the groups themselves
# keeps every row, instead of aligning onto a fixed 21000-row scaffold.
box_data = pd.DataFrame({name: group for name, group in box_total_area})

fig, ax = plt.subplots(figsize=(12, 7))
ax.set_ylabel("总价(万元)", fontsize=14)
ax.set_title("南京各区域二手房总价箱线图", fontsize=18)
box_data.plot(
    kind="box",
    fontsize=12,
    sym='r+',
    grid=True,
    ax=ax,
    yticks=[0, 200, 500, 1000, 2000, 3000],
    ylim=[0, 2100],
)

# Average floor area per district.
# Computation: mean of "jzmj" per "areaName".
groups_area_jzmj = df["jzmj"].groupby(df["areaName"])
mean_jzmj = groups_area_jzmj.mean()
# Clear the index name (presumably so it is not drawn as an axis label).
mean_jzmj.index.name = ""

# Visualisation
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_ylabel("建筑面积(㎡)", fontsize=14)
ax.set_title("南京各区域二手房平均建筑面积", fontsize=18)
# Draw on the axes configured above rather than on the implicit current axes.
mean_jzmj.plot(kind="bar", fontsize=12, ax=ax)

# Average unit price and average floor area per district, stacked in one figure.
groups_unitprice_area = df["unitPriceValue"].groupby(df["areaName"])
mean_unitprice = groups_unitprice_area.mean()
mean_unitprice.index.name = ""

groups_area_jzmj = df["jzmj"].groupby(df["areaName"])
mean_jzmj = groups_area_jzmj.mean()
mean_jzmj.index.name = ""

# Two rows, one column: unit price on top, floor area below.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.set_ylabel("单价(元/平米)", fontsize=14)
ax1.set_title("南京各区域二手房平均单价", fontsize=18)
ax2.set_ylabel("建筑面积(㎡)", fontsize=14)
ax2.set_title("南京各区域二手房平均建筑面积", fontsize=18)
# Extra vertical spacing between the two panels.
plt.subplots_adjust(hspace=0.4)

mean_unitprice.plot(kind="bar", ax=ax1, fontsize=12)
mean_jzmj.plot(kind="bar", ax=ax2, fontsize=12)

# Number of listings per district: count of "id" values in each "areaName" group.
groups_area = df["id"].groupby(df["areaName"])
count_area = groups_area.count()
# Clear the index name (presumably so it is not drawn as an axis label).
count_area.index.name = ""

fig, ax = plt.subplots(figsize=(12, 7))
ax.set_ylabel("房源数量(套)", fontsize=14)
ax.set_title("南京各区域二手房房源数量", fontsize=18)
# Districts are ordered by ascending count; draw on the configured axes.
count_area.sort_values().plot(kind="line", fontsize=12, grid=True, marker="o", ax=ax)

# Top 20 listings by unit price.
# Take the 20 highest unit prices, then sort them ascending for plotting.
unitprice_top = df.sort_values(by="unitPriceValue", ascending=False)[:20]
unitprice_top = unitprice_top.sort_values(by="unitPriceValue")
# Use the community name as the bar label.
unitprice_top.set_index(unitprice_top["communityName"], inplace=True)
unitprice_top.index.name = ""

fig, ax = plt.subplots(figsize=(12, 7))
# In a horizontal bar chart the values run along the x axis, so the
# unit-price label belongs there (the original set it on the y axis).
ax.set_xlabel("单价(元/平米)", fontsize=14)
ax.set_title("南京二手房单价最高Top20", fontsize=18)
unitprice_top["unitPriceValue"].plot(kind="barh", fontsize=12, ax=ax)

# Scatter plot: total price against floor area.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("南京二手房总价与建筑面积散点图", fontsize=18)
df.plot(
    x="jzmj",
    y="total",
    kind="scatter",
    fontsize=12,
    ax=ax,
    alpha=0.4,
    xticks=[0, 50, 100, 150, 200, 250, 300, 400, 500, 600, 700],
    xlim=[0, 800],
)
# Axis labels are set after plotting so they replace the ones pandas assigns.
ax.set_xlabel("建筑面积(㎡)", fontsize=14)
ax.set_ylabel("总价(万元)", fontsize=14)

# Scatter plot: unit price against floor area.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("南京二手房单价与建筑面积散点图", fontsize=18)
df.plot(
    x="jzmj",
    y="unitPriceValue",
    kind="scatter",
    grid=True,
    fontsize=12,
    ax=ax,
    alpha=0.4,
    xticks=[0, 50, 100, 150, 200, 250, 300, 400, 500, 600, 700],
    xlim=[0, 800],
)
ax.set_xlabel("建筑面积(㎡)", fontsize=14)
ax.set_ylabel("单价(元/平米)", fontsize=14)

from wordcloud import WordCloud
import jieba

# Word cloud of the second-hand housing data.
# Basic configuration
filename = "data_file\\ershoufang-clean-utf8-v1.1.csv"
backpicture = "resources\\house2.jpg"
savepicture = "data_ana\\picture\\南京二手房数据词云2.png"
fontpath = "resources\\simhei.ttf"
stopwords = ["null", "暂无", "数据", "上传", "照片", "房本"]

# Read the whole data file as text; the context manager closes the handle.
with open(filename, encoding="utf-8") as f:
    comment_text = f.read()

# Read the mask image. scipy.misc.imread no longer exists in current SciPy,
# so the already-imported matplotlib reader is used instead.
# NOTE(review): plt.imread returns integer arrays for JPEG but float arrays
# for PNG - confirm the mask dtype if the background image format changes.
color_mask = plt.imread(backpicture)

# Segment the text with jieba and drop the unwanted words.
ershoufang_words = [
    word for word in jieba.cut(comment_text) if word not in stopwords
]
cut_text = " ".join(ershoufang_words)

# Word cloud settings
cloud = WordCloud(
    # Font file; without one that has Chinese glyphs the text is garbled
    font_path=fontpath,
    # Background colour
    background_color='white',
    # Shape of the cloud
    mask=color_mask,
    # Maximum number of words
    max_words=2000,
    # Largest font size
    max_font_size=60,
)
# Generate the cloud and save it as an image.
word_cloud = cloud.generate(cut_text)
word_cloud.to_file(savepicture)