利用python爬虫自动爬取天气数据+机器学习随机森林算法预测温度+pyecharts可视化显示

最新推荐文章于 2026-06-28 17:29:25 发布

原创最新推荐文章于 2026-06-28 17:29:25 发布 · 362 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#python #随机森林 #开发语言

python 专栏收录该内容

2 篇文章

订阅专栏

python高级爬虫：

需要先下载 edgedriver，下载完成后请记住解压路径，后面的 python 脚本中会用到。

python脚本代码：

import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from lxml import html
import time
from datetime import datetime
def get_weather_data(city, year, month, driver_path='D:\\edgediver\\msedgedriver.exe'):
    """Scrape one month of daily weather rows from lishi.tianqi.com.

    Args:
        city: City slug used in the URL path (e.g. ``'dalian'``).
        year: Four-digit year placed in the URL.
        month: Month number; it is zero-padded to two digits for the URL.
        driver_path: Location of the msedgedriver executable. Defaults to the
            path the script originally hard-coded.

    Returns:
        A list of dicts, one per ``<li>`` under ``ul.thrui``, with the keys
        ``date``, ``high_temp``, ``low_temp``, ``weather`` and ``wind``.

    Raises:
        IndexError: If a row lacks one of the expected cells.
    """
    # Local import: selenium is already a dependency of this script, and the
    # narrower exception type is only needed inside this function.
    from selenium.common.exceptions import WebDriverException

    # Run Edge headless so no browser window is opened.
    edge_options = Options()
    edge_options.add_argument("--headless")
    edge_options.add_argument("--disable-gpu")

    service = Service(driver_path)
    driver = webdriver.Edge(service=service, options=edge_options)

    # Everything that can fail while the browser is open sits inside the
    # try/finally so the Edge process is always shut down.
    try:
        url = f'https://lishi.tianqi.com/{city}/{year}{int(month):02d}.html'
        driver.get(url)

        # Fixed wait for the page to load.
        time.sleep(3)

        # Click the "show more" control when present; its absence is treated
        # as non-fatal and only reported.
        try:
            driver.find_element(By.CLASS_NAME, 'lishidesc2').click()
        except WebDriverException:
            print("该网页尚未更新最新的每月总结，您本月尚未结束，请结束后再来收集！！")

        # Fixed wait for the expanded content to load.
        time.sleep(3)

        html_content = driver.page_source
    finally:
        driver.quit()

    # Parsing works on the captured source, so it runs after the browser is closed.
    tree = html.fromstring(html_content)
    forecast_items = tree.xpath('//ul[@class="thrui"]/li')

    # Each row holds one th200 cell (date) followed by four th140 cells.
    cell_paths = {
        'date': './/div[@class="th200"]/text()',
        'high_temp': './/div[@class="th140"][1]/text()',
        'low_temp': './/div[@class="th140"][2]/text()',
        'weather': './/div[@class="th140"][3]/text()',
        'wind': './/div[@class="th140"][4]/text()',
    }
    weather_info = []
    for item in forecast_items:
        weather_info.append(
            {key: item.xpath(path)[0].strip() for key, path in cell_paths.items()}
        )
    return weather_info

def save_weather():
    """Scrape every month of the current year so far and write the rows to a CSV.

    Months 1 through the current month are fetched with ``get_weather_data``
    for the hard-coded city, then written (header first) to a UTF-8 CSV file
    named after the city and year.
    """
    # Only months up to "now" are requested, so pages for future months are never fetched.
    current_time = datetime.now()
    current_year = current_time.year
    last_month = current_time.month

    # City slug used both in the URL and in the output file name.
    city = 'dalian'

    year_weather_data = []
    for month_number in range(1, last_month + 1):
        year_weather_data += get_weather_data(city, current_year, month_number)

    # Output location of the CSV file.
    csv_file_path = f'D:\\python_project\\Dazuoye\\{city}_weather_{current_year}.csv'

    # Record keys in the same order as the header columns.
    record_keys = ('date', 'high_temp', 'low_temp', 'weather', 'wind')
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['日期', '最高气温', '最低气温', '天气', '风级'])
        for record in year_weather_data:
            writer.writerow([record[key] for key in record_keys])

# Run the scrape-and-save job when this script is executed.
save_weather()

数据获取成功图：

机器学习部分:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the per-year weather CSV files.
train2024 = pd.read_csv('dalian_weather_2024.csv', encoding='utf-8')
features2023 = pd.read_csv('dalian_weather_2023.csv', encoding='utf-8')
# Stack 2023 followed by 2024 into a single frame with a fresh index.
combined_data = pd.concat([features2023, train2024], ignore_index=True)
# Keep only the first 10 characters of the date field; the rest of the field
# is discarded before parsing.
combined_data['日期'] = combined_data['日期'].apply(lambda x: x[0:10])
# Strip the '℃' unit and surrounding whitespace, then convert both temperature columns to int.
for col in ['最高气温', '最低气温']:
    combined_data[col] = combined_data[col].str.replace('℃', '').str.strip().astype(int)
# Split the parsed date into year / month / day / weekday features.
combined_data['日期'] = pd.to_datetime(combined_data['日期'])
combined_data['年'] = combined_data['日期'].dt.year
combined_data['月'] = combined_data['日期'].dt.month
combined_data['日'] = combined_data['日期'].dt.day
combined_data['星期'] = combined_data['日期'].dt.weekday
# dt.weekday numbers Monday as 0 and Sunday as 6. The earlier mapping started
# at Sunday = 0, which shifted every label by one day and disagreed with the
# day_name()-based labels used for the prediction frame.
day_map = {
    0: 'Mon',
    1: 'Tue',
    2: 'Wed',
    3: 'Thu',
    4: 'Fri',
    5: 'Sat',
    6: 'Sun'
}

# Replace the weekday number with its three-letter English abbreviation.
combined_data['星期'] = combined_data['星期'].map(day_map)

# The raw date column is no longer needed once its parts are extracted.
combined_data.drop(['日期'], axis=1, inplace=True)

# Fix the column order: calendar features first, then the two temperature targets.
# (An average-temperature column was previously computed and then immediately
# discarded by this selection, so it is no longer built.)
combined_data = combined_data[['年', '月', '日', '星期', '最高气温', '最低气温']]

# Persist the cleaned data.
combined_data.to_csv('clean_data.csv', index=False)


# One-hot encode the non-numeric columns so every feature is numeric.
combined_data = pd.get_dummies(combined_data)
target_columns = ['最高气温', '最低气温']
X = combined_data.drop(columns=target_columns)
y = combined_data[target_columns]
# Hold out 25% of the rows for testing; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
for shape_label, subset in (
    ('训练集特征样式：', train_X),
    ('测试集特征样式：', test_X),
    ('训练集标签样式：', train_y),
    ('测试集标签样式：', test_y),
):
    print(shape_label, subset.shape)

# Fit a random forest that predicts both temperature columns at once.
model = RandomForestRegressor(n_estimators=1000, random_state=42)
model.fit(train_X, train_y)

# Score the model on the held-out rows.
predictions = model.predict(test_X)
# Mean squared error.
mse = mean_squared_error(test_y, predictions)
# Root mean squared error, derived from the MSE.
rmse = np.sqrt(mse)
# Mean absolute error.
mae = mean_absolute_error(test_y, predictions)
# For all three metrics, a smaller value means a closer fit.
print('MSE:', mse)
print('RMSE关注较大的误差:', rmse)
print('MAE:', mae)


# Pair each feature with its importance rounded to two decimals, highest first.
importances = list(model.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(train_X.columns, importances)),
    key=lambda x: x[1],
    reverse=True,
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))




# Every calendar day from 2023-01-01 through 2024-12-31.
dates = pd.date_range(start='2023-01-01', end='2024-12-31')

# Build the same calendar features the model was trained on.
future_data = pd.DataFrame({
    '年': dates.year,
    '月': dates.month,
    '日': dates.day,
    '星期': dates.day_name()
})
# Shorten the English weekday name to its first three letters.
future_data['星期'] = future_data['星期'].str[:3]

# One-hot encode the weekday column.
future_data = pd.get_dummies(future_data)

# Align with the training features: add any absent column as zeros, then
# select the training columns in the training order.
absent_columns = [name for name in train_X.columns if name not in future_data.columns]
for name in absent_columns:
    future_data[name] = 0
future_data = future_data[train_X.columns]

# Predict both temperatures for every day and put the date in the first column.
future_predictions = model.predict(future_data)
future_df = pd.DataFrame(future_predictions, columns=['预测最高气温', '预测最低气温'])
future_df.insert(0, '日期', pd.date_range(start='2023-01-01', periods=len(future_df), freq='D'))

# Show the predictions.
print(future_df)
# Plot actual versus predicted high/low temperatures for 2023-2024.
plt.figure(figsize=(10, 6))

# Rebuild a datetime column from the year / month / day feature columns.
combined_data['日期'] = pd.to_datetime(
    combined_data[['年', '月', '日']].rename(columns={'年': 'year', '月': 'month', '日': 'day'})
)
# Use the SimHei font (presumably so the Chinese labels render) and an ASCII minus sign.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
curves = (
    (combined_data, '最高气温', '实际最高温度'),
    (combined_data, '最低气温', '实际最低温度'),
    (future_df, '预测最高气温', '预测最高温度'),
    (future_df, '预测最低气温', '预测最低温度'),
)
for frame, value_column, curve_label in curves:
    plt.plot(frame['日期'], frame[value_column], label=curve_label)
plt.xlabel('日期')
plt.ylabel('温度(°C)')
plt.title('2023-2024年的天气最高最低气温预报')
plt.legend()
# Save the figure to disk before showing it.
plt.savefig('weather_predictions2023-2024.png')
plt.show()
# Store the predicted temperatures as CSV.
future_df.to_csv('future_weather_predictions2023-2024.csv', index=False)

运行后截图：

pyecharts可视化显示：

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline
import re

# Read the weather CSV into a DataFrame.
df = pd.read_csv('dalian_weather_2025.csv', encoding='utf-8')

# 自定义解析函数
def custom_date_parser(date_str):
    """Parse a date string after removing any Chinese weekday label.

    Every "星期" token followed by one of 一二三四五六日 is deleted, and the
    remaining text is handed to ``pd.to_datetime``.
    """
    weekday_label = r'星期[一二三四五六日]'
    without_weekday = re.sub(weekday_label, '', date_str)
    return pd.to_datetime(without_weekday)

# Parse every entry of the date column with the custom parser.
df['日期'] = df['日期'].map(custom_date_parser)

# Expose year, month and day as separate columns.
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['日期'].dt, part)

# Strip the '℃' unit and surrounding whitespace so both temperature columns become ints.
for col in ['最高气温', '最低气温']:
    without_unit = df[col].str.replace('℃', '')
    df[col] = without_unit.str.strip().astype(int)
print(df)
# Build a timeline with one bar chart per (year, month).
timeline = Timeline()
timeline.add_schema(play_interval=1000)

# Group the daily rows by year and month.
monthly_groups = df.groupby(['year', 'month'])

for (year, month), group in monthly_groups:
    month_str = f"{year}-{month:02d}"

    # Two bars per day: the high and the low temperature.
    # The series colour is set per series here. The earlier version passed a
    # Python lambda as the ECharts colour callback; the chart runs in the
    # browser, so a Python callable cannot act as that callback, and it also
    # compared params.name (the x-axis category) against a series name.
    bar = Bar()
    bar.add_xaxis(group['日期'].dt.strftime('%Y-%m-%d').tolist())
    bar.add_yaxis(
        "最高气温",
        group['最高气温'].tolist(),
        category_gap="50%",
        itemstyle_opts=opts.ItemStyleOpts(color="#c23531"),
    )
    bar.add_yaxis(
        "最低气温",
        group['最低气温'].tolist(),
        category_gap="50%",
        itemstyle_opts=opts.ItemStyleOpts(color="#2f4554"),
    )

    # Show the value label on every bar.
    bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True))

    # Chart-level options: title, axis tooltip and legend position.
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title=f"大连{month_str}最高最低气温"),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="shadow"),
        legend_opts=opts.LegendOpts(pos_bottom="1%"),
    )

    # Add this month's chart to the timeline under its "YYYY-MM" label.
    timeline.add(bar, month_str)

# Write the timeline to an HTML file.
timeline.render('monthly_daily_temperatures_bar.html')

html打开后结果显示：