python高级爬虫:
需要先下载 EdgeDriver(msedgedriver),下载并解压后请记下解压路径,Python 脚本中需要用到该路径。


python脚本代码:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from lxml import html
import time
from datetime import datetime
def get_weather_data(city, year, month, driver_path='D:\\edgediver\\msedgedriver.exe'):
    """Scrape one month of daily weather rows from lishi.tianqi.com.

    A headless Edge session loads the month page, tries to click the element
    with class ``lishidesc2`` (the "show more" button) so the whole month is
    rendered, and the resulting page source is parsed with lxml.

    Args:
        city: City slug used in the URL path, e.g. ``'dalian'``.
        year: Four-digit year placed in the URL.
        month: Month number (1-12); it is zero-padded to two digits.
        driver_path: Path to the msedgedriver executable.

    Returns:
        A list of dicts, one per ``<li>`` under ``ul.thrui`` that has a date
        cell, with the keys ``date``, ``high_temp``, ``low_temp``, ``weather``
        and ``wind``. A cell that is absent from the page yields ``''``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    def first_text(node, query):
        """Return the first stripped XPath text match, or '' when none."""
        matches = node.xpath(query)
        return matches[0].strip() if matches else ''

    # Configure Edge: headless (no visible window) with the GPU disabled.
    edge_options = Options()
    edge_options.add_argument("--headless")
    edge_options.add_argument("--disable-gpu")
    service = Service(driver_path)

    # The site expects a two-digit month, e.g. .../dalian/202403.html
    url = f'https://lishi.tianqi.com/{city}/{year}{int(month):02d}.html'

    driver = webdriver.Edge(service=service, options=edge_options)
    try:
        driver.get(url)
        # Fixed wait for the page to load.
        time.sleep(3)
        try:
            driver.find_element(By.CLASS_NAME, 'lishidesc2').click()
        except WebDriverException:
            # Best effort: the button is missing or not clickable, so carry on
            # with whatever rows the page already shows.
            print("该网页尚未更新最新的每月总结,您本月尚未结束,请结束后再来收集!!")
        # Fixed wait for the extra rows to load.
        time.sleep(3)
        html_content = driver.page_source
    finally:
        # Always shut the browser down, even if loading or clicking failed.
        driver.quit()

    tree = html.fromstring(html_content)
    weather_info = []
    for item in tree.xpath('//ul[@class="thrui"]/li'):
        date_text = first_text(item, './/div[@class="th200"]/text()')
        if not date_text:
            # A row without a date cannot be used downstream; skip it.
            continue
        weather_info.append({
            'date': date_text,
            'high_temp': first_text(item, './/div[@class="th140"][1]/text()'),
            'low_temp': first_text(item, './/div[@class="th140"][2]/text()'),
            'weather': first_text(item, './/div[@class="th140"][3]/text()'),
            'wind': first_text(item, './/div[@class="th140"][4]/text()'),
        })
    return weather_info
def save_weather(city='dalian', year=None, output_dir='D:\\python_project\\Dazuoye'):
    """Scrape a year of daily weather for ``city`` and write it to a CSV file.

    Args:
        city: City slug passed on to ``get_weather_data``.
        year: Year to collect. ``None`` means the current year.
        output_dir: Directory that receives ``{city}_weather_{year}.csv``;
            it is created when it does not exist.

    For a past year all twelve months are fetched. For the current year only
    months up to and including the current one are requested, so that pages
    for months that do not exist yet are never loaded.
    """
    import os

    now = datetime.now()
    if year is None:
        year = now.year
    last_month = 12 if year < now.year else now.month

    year_weather_data = []
    for month in range(1, last_month + 1):
        year_weather_data.extend(get_weather_data(city, year, month))

    os.makedirs(output_dir, exist_ok=True)
    csv_file_path = os.path.join(output_dir, f'{city}_weather_{year}.csv')

    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['日期', '最高气温', '最低气温', '天气', '风级'])
        writer.writerows(
            [data['date'], data['high_temp'], data['low_temp'], data['weather'], data['wind']]
            for data in year_weather_data
        )


if __name__ == "__main__":
    save_weather()
数据获取成功图:

机器学习部分:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the two yearly CSV files.
train2024 = pd.read_csv('dalian_weather_2024.csv', encoding='utf-8')
features2023 = pd.read_csv('dalian_weather_2023.csv', encoding='utf-8')
# Stack both years into a single frame with a fresh 0..n-1 index.
combined_data = pd.concat([features2023, train2024], ignore_index=True)
# Keep only the first 10 characters of each date cell
# (presumably 'YYYY-MM-DD' followed by a weekday label - verify against the CSV).
combined_data['日期'] = combined_data['日期'].str[:10]
# Strip the '℃' unit and surrounding whitespace, then convert to integers.
for col in ['最高气温', '最低气温']:
    combined_data[col] = combined_data[col].str.replace('℃', '').str.strip().astype(int)
# Split the date into calendar components.
combined_data['日期'] = pd.to_datetime(combined_data['日期'])
combined_data['年'] = combined_data['日期'].dt.year
combined_data['月'] = combined_data['日期'].dt.month
combined_data['日'] = combined_data['日期'].dt.day
combined_data['星期'] = combined_data['日期'].dt.weekday
# Series.dt.weekday counts Monday as 0 and Sunday as 6, so the mapping must
# start at Monday; these abbreviations match the first three letters of
# day_name(), which the prediction step uses for its weekday labels.
day_map = {
    0: 'Mon',
    1: 'Tue',
    2: 'Wed',
    3: 'Thu',
    4: 'Fri',
    5: 'Sat',
    6: 'Sun'
}
combined_data['星期'] = combined_data['星期'].map(day_map)
# The raw date column is no longer needed once it has been split.
combined_data.drop(['日期'], axis=1, inplace=True)
# Fix the column order: calendar features first, then the two temperatures.
# (An average-temperature column used to be computed here, but this selection
# discarded it immediately, so it is no longer created.)
combined_data = combined_data[['年', '月', '日', '星期', '最高气温', '最低气温']]
# Persist the cleaned table.
combined_data.to_csv('clean_data.csv', index=False)
# One-hot encode the non-numeric columns so every feature is numeric.
combined_data = pd.get_dummies(combined_data)
# Targets are the two temperature columns; everything else is a feature.
y = combined_data[['最高气温', '最低气温']]
X = combined_data.drop(columns=['最高气温', '最低气温'])
# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
for caption, part in (
    ('训练集特征样式:', train_X),
    ('测试集特征样式:', test_X),
    ('训练集标签样式:', train_y),
    ('测试集标签样式:', test_y),
):
    print(caption, part.shape)
# Fit a 1000-tree random forest on the training split.
model = RandomForestRegressor(n_estimators=1000, random_state=42)
model.fit(train_X, train_y)
# Predict both temperatures for the held-out rows.
predictions = model.predict(test_X)
# Mean squared error.
mse = mean_squared_error(test_y, predictions)
# Root mean squared error (square root of the MSE).
rmse = np.sqrt(mse)
# Mean absolute error.
mae = mean_absolute_error(test_y, predictions)
# For all three metrics, smaller values mean more accurate predictions.
print('MSE:', mse)
print('RMSE关注较大的误差:', rmse)
print('MAE:', mae)
# Pair each feature with its importance rounded to two decimals, largest first.
importances = list(model.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(train_X.columns, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))
# Every calendar day from 2023-01-01 through 2024-12-31.
dates = pd.date_range(start='2023-01-01', end='2024-12-31')
# Build the feature frame; the weekday is the first three letters of the
# English day name (e.g. 'Mon').
future_data = pd.DataFrame({
    '年': dates.year,
    '月': dates.month,
    '日': dates.day,
    '星期': dates.day_name().str[:3],
})
# One-hot encode the weekday column.
future_data = pd.get_dummies(future_data)
# Align with the training features: same columns in the same order, with any
# column that is absent here filled with 0.
future_data = future_data.reindex(columns=train_X.columns, fill_value=0)
# Predict the high and low temperature for every day in the range.
future_predictions = model.predict(future_data)
future_df = pd.DataFrame(future_predictions, columns=['预测最高气温', '预测最低气温'])
# Put the matching calendar date in front of the two prediction columns.
future_df.insert(0, '日期', dates)
# Show the predictions.
print(future_df)
# Plot actual versus predicted high/low temperatures for 2023-2024.
plt.figure(figsize=(10, 6))
# Rebuild a datetime column from the separate year / month / day columns.
combined_data['日期'] = pd.to_datetime({
    'year': combined_data['年'],
    'month': combined_data['月'],
    'day': combined_data['日'],
})
# Use the SimHei font for the Chinese labels and keep minus signs rendering
# with that font.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# Draw the four curves in a fixed order so the default color cycle is stable.
for frame, column, caption in (
    (combined_data, '最高气温', '实际最高温度'),
    (combined_data, '最低气温', '实际最低温度'),
    (future_df, '预测最高气温', '预测最高温度'),
    (future_df, '预测最低气温', '预测最低温度'),
):
    plt.plot(frame['日期'], frame[column], label=caption)
plt.xlabel('日期')
plt.ylabel('温度(°C)')
plt.title('2023-2024年的天气最高最低气温预报')
plt.legend()
# Save the figure before showing it.
plt.savefig('weather_predictions2023-2024.png')
plt.show()
# Store the predicted temperatures as CSV.
future_df.to_csv('future_weather_predictions2023-2024.csv', index=False)
运行后截图:

pyecharts可视化显示:
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline
import re
# Load the daily weather CSV (UTF-8 encoded) into a DataFrame.
df = pd.read_csv('dalian_weather_2025.csv', encoding='utf-8')
def custom_date_parser(date_str):
    """Parse a date string that may carry a Chinese weekday label.

    Every occurrence of ``星期`` followed by one of ``一二三四五六日`` is
    removed from ``date_str``; what remains is handed to ``pd.to_datetime``,
    whose result is returned.
    """
    weekday_label = re.compile(r'星期[一二三四五六日]')
    return pd.to_datetime(weekday_label.sub('', date_str))
# Turn every raw date cell into a timestamp with the custom parser.
df['日期'] = df['日期'].apply(custom_date_parser)
# Expose the calendar components as separate columns.
date_parts = df['日期'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day'] = date_parts.day
# Drop the '℃' unit and surrounding whitespace so both temperatures are ints.
for col in ('最高气温', '最低气温'):
    without_unit = df[col].str.replace('℃', '')
    df[col] = without_unit.str.strip().astype(int)
print(df)
# Build a timeline with one frame per (year, month); frames advance every
# 1000 ms during playback.
timeline = Timeline()
timeline.add_schema(play_interval=1000)
# Group the daily rows by calendar month.
monthly_groups = df.groupby(['year', 'month'])
for (year, month), group in monthly_groups:
    month_str = f"{year}-{month:02d}"
    # Two bar series per month: daily highs and daily lows.
    bar = Bar()
    bar.add_xaxis(group['日期'].dt.strftime('%Y-%m-%d').tolist())
    # Colors are set per series here. A Python lambda cannot be emitted as
    # ECharts JavaScript, and set_series_opts would apply one style to both
    # series, so the intended red/blue split would never take effect.
    bar.add_yaxis(
        "最高气温",
        group['最高气温'].tolist(),
        category_gap="50%",
        itemstyle_opts=opts.ItemStyleOpts(color="#c23531"),
    )
    bar.add_yaxis(
        "最低气温",
        group['最低气温'].tolist(),
        category_gap="50%",
        itemstyle_opts=opts.ItemStyleOpts(color="#2f4554"),
    )
    # Show the value label on every bar of both series.
    bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    # Chart-level options: title, axis-triggered tooltip, legend at the bottom.
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title=f"大连{month_str}最高最低气温"),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="shadow"),
        legend_opts=opts.LegendOpts(pos_bottom="1%"),
    )
    # Add this month's chart as one frame of the timeline.
    timeline.add(bar, month_str)
# Render the whole timeline to a standalone HTML file.
timeline.render('monthly_daily_temperatures_bar.html')
html打开后结果显示:

715

被折叠的 条评论
为什么被折叠?



