使用json.dump(citys_data, f, ensure_ascii=False)写文件的时候,如果要写入汉字,则要指定ensure_ascii为False

发布时间 2023-09-12 10:52:12作者: 勋勋的大宝贝

这个代码例子会获取链家网里所有的城市,然后按照{省名:{市名:url, 市名:url, ...}, ...}的结构写入citys_data.json文件。

import requests
from lxml import etree
import json


def get_all_city(timeout=10):
    """Scrape Lianjia's nationwide city index into a nested mapping.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block forever.

    Returns:
        A dict shaped like ``{province_name: {city_name: city_url, ...}, ...}``,
        or ``None`` if the page could not be fetched (network error, timeout,
        or a non-2xx HTTP status). The error is printed in that case.
    """
    url = "https://www.lianjia.com/city/"  # nationwide city list
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # An error page (4xx/5xx) would otherwise be parsed as if it were the
        # city list and silently yield an empty result.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP statuses alike.
        print(e)
        return None

    tree = etree.HTML(response.text)

    citys = {}
    for province in tree.xpath("//div[@class='city_province']"):
        titles = province.xpath(".//div[@class='city_list_tit c_b']/text()")
        if not titles:
            # No title node in this block: skip it rather than crash on [0].
            continue
        province_name = titles[0]  # province name, e.g. 山东
        province_city_name = province.xpath(".//ul/li/a/text()")  # city names, e.g. ['菏泽', '济南', ...]
        province_city_url = province.xpath(".//ul/li/a/@href")  # matching urls, e.g. ['https://heze.lianjia.com/', ...]

        # Pair each city name with its url.
        citys[province_name] = dict(zip(province_city_name, province_city_url))
    return citys


if __name__ == '__main__':
    citys_data = get_all_city()
    # get_all_city() returns None when the request fails. Stop before opening
    # the file so a failed run does not overwrite it with a bare "null".
    if citys_data is None:
        raise SystemExit("failed to fetch the city list; citys_data.json was not written")
    with open("citys_data.json", "w", encoding='utf-8') as f:
        # ensure_ascii=False writes Chinese characters as-is instead of
        # \uXXXX escape sequences.
        json.dump(citys_data, f, ensure_ascii=False)

 

json.dump(citys_data, f),默认ensure_ascii为True,中文会被转义成\uXXXX的形式(对应下面第一段输出)
json.dump(citys_data, f, ensure_ascii=False),指定ensure_ascii为False,中文按原样写入(对应下面第二段输出)
{
"\u5b89\u5fbd": {
"\u5b89\u5e86": "https://aq.lianjia.com/",
"\u6ec1\u5dde": "https://cz.fang.lianjia.com/",
"\u961c\u9633": "https://fy.lianjia.com/",
"\u5408\u80a5": "https://hf.lianjia.com/",
"\u9a6c\u978d\u5c71": "https://mas.lianjia.com/",
"\u829c\u6e56": "https://wuhu.lianjia.com/"
},
"\u5317\u4eac": {
"\u5317\u4eac": "https://bj.lianjia.com/"
},
"\u91cd\u5e86": {
"\u91cd\u5e86": "https://cq.lianjia.com/"
},
。。。。。。。
}
{
"安徽": {
"安庆": "https://aq.lianjia.com/",
"滁州": "https://cz.fang.lianjia.com/",
"阜阳": "https://fy.lianjia.com/",
"合肥": "https://hf.lianjia.com/",
"马鞍山": "https://mas.lianjia.com/",
"芜湖": "https://wuhu.lianjia.com/"
},
"北京": {
"北京": "https://bj.lianjia.com/"
},
"重庆": {
"重庆": "https://cq.lianjia.com/"
},
。。。。。。。
}