Fundamentals of Reinforcement Learning in Deep Learning

1. Background and Motivation

Reinforcement Learning (RL) is an important branch of machine learning concerned with how an agent learns an optimal policy through interaction with its environment in order to maximize cumulative reward. In recent years, Deep Reinforcement Learning (DRL), which combines deep learning with reinforcement learning, has produced a series of breakthrough results, such as AlphaGo defeating human Go champions and robots manipulating complex objects.

Understanding the basic principles and implementation methods of reinforcement learning is essential for mastering modern artificial intelligence techniques. Starting from the fundamental concepts, this article explores the core principles, algorithm implementations, and application scenarios of reinforcement learning, giving readers a comprehensive picture of the field.

2. Core Principles

2.1 Basic Concepts of Reinforcement Learning

The core concepts of reinforcement learning include the following; the value-related quantities are written out formally right after the list:

  • Agent: the entity that learns and executes actions
  • Environment: the external world the agent interacts with
  • State: the current situation of the environment
  • Action: an operation the agent can perform
  • Reward: the environment's feedback on the agent's action
  • Policy: the rule by which the agent selects actions
  • Value Function: estimates the long-term value of a state
  • Action-Value Function (Q function): estimates the long-term value of taking a particular action in a particular state
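
More precisely, with r_t denoting the reward at step t and γ ∈ [0, 1) the discount factor, the standard definitions are: the return G_t = r_{t+1} + γ·r_{t+2} + γ²·r_{t+3} + …, the state-value function V(s) = E[G_t | s_t = s] (the expected return when starting from state s and following the policy), and the action-value function Q(s, a) = E[G_t | s_t = s, a_t = a] (the expected return when taking action a in state s and following the policy thereafter). These are exactly the quantities that the algorithms in Section 3 estimate.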

2.2 How Reinforcement Learning Works

The basic workflow of reinforcement learning is as follows (a minimal code sketch of this loop is shown after the list):

  1. The agent observes the environment and obtains the current state
  2. The agent selects an action according to its policy
  3. The environment executes the action and its state changes
  4. The environment returns a reward signal
  5. The agent updates its policy based on the reward signal
  6. The process repeats until a termination condition is reached
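
The following sketch runs this loop with a random policy on the CartPole environment, assuming gym >= 0.26 (the same API used by the code in Section 3); the action-selection line is where a learned policy would plug in.

import gym

env = gym.make('CartPole-v1')
state, info = env.reset()                    # 1. observe the current state
done = False
total_reward = 0.0

while not done:
    action = env.action_space.sample()       # 2. select an action (a random policy here)
    next_state, reward, terminated, truncated, info = env.step(action)  # 3-4. the environment transitions and returns a reward
    total_reward += reward                   # 5. a learning agent would update its policy here
    done = terminated or truncated           # 6. repeat until the episode ends
    state = next_state

print(f"Episode return: {total_reward}")
env.close()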

3. Code Implementation

3.1 The Q-Learning Algorithm
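
Q-Learning maintains a table of Q values and updates it with the temporal-difference rule Q(s, a) ← Q(s, a) + α·[r + γ·max_a' Q(s', a') − Q(s, a)], where α is the learning rate and γ the discount factor; this is exactly the update performed in the training loop below. Because CartPole has a continuous state space, each state dimension is first discretized into bins so that a table can be used.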

import numpy as np
import gym

# Create the environment
env = gym.make('CartPole-v1')

# Q-Learning hyperparameters
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 1.0
exploration_decay = 0.995
exploration_min = 0.01

# Discretize the continuous state into bin indices
def discretize_state(state, bins):
    state_index = []
    for i in range(len(state)):
        # Clip so the index always stays inside the Q table
        index = np.clip(np.digitize(state[i], bins[i]) - 1, 0, len(bins[i]) - 1)
        state_index.append(index)
    return tuple(state_index)

# Bins used to discretize each state dimension
bins = [
    np.linspace(-4.8, 4.8, 20),      # cart position
    np.linspace(-4, 4, 20),          # cart velocity
    np.linspace(-0.418, 0.418, 20),  # pole angle
    np.linspace(-4, 4, 20)           # pole angular velocity
]

# Initialize the Q table
state_space_size = [len(b) for b in bins]
action_space_size = env.action_space.n
q_table = np.zeros(state_space_size + [action_space_size])

# Training parameters
episodes = 10000
max_steps = 500

# Q-Learning training loop
for episode in range(episodes):
    state = env.reset()
    state = discretize_state(state[0], bins)
    done = False
    step = 0
    
    while not done and step < max_steps:
        # Epsilon-greedy action selection
        if np.random.uniform(0, 1) < exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        next_state = discretize_state(next_state, bins)
        
        # Update the Q value
        old_value = q_table[state + (action,)]
        next_max = np.max(q_table[next_state])
        new_value = old_value + learning_rate * (reward + discount_factor * next_max - old_value)
        q_table[state + (action,)] = new_value
        
        state = next_state
        step += 1
    
    # Decay the exploration rate
    exploration_rate = max(exploration_min, exploration_rate * exploration_decay)
    
    if (episode + 1) % 1000 == 0:
        print(f"Episode: {episode+1}, Exploration Rate: {exploration_rate:.4f}, Steps: {step}")

# Evaluate the trained policy
test_episodes = 10
test_steps = 0

for episode in range(test_episodes):
    state = env.reset()
    state = discretize_state(state[0], bins)
    done = False
    step = 0
    
    while not done and step < max_steps:
        action = np.argmax(q_table[state])
        next_state, reward, done, _, _ = env.step(action)
        next_state = discretize_state(next_state, bins)
        state = next_state
        step += 1
    
    test_steps += step
    print(f"Test Episode: {episode+1}, Steps: {step}")

print(f"Average Test Steps: {test_steps / test_episodes}")
env.close()

3.2 The Deep Q-Network (DQN) Algorithm
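
DQN replaces the Q table with a neural network Q(s, a; θ) and stabilizes training with two ingredients, both visible in the code below: an experience replay buffer, which stores transitions and samples random mini-batches to break temporal correlations, and a separate target network whose weights are periodically copied from the policy network and used to compute the bootstrap target r + γ·max_a' Q(s', a'; θ⁻).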

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

# DQN model
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Experience replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    
    def __len__(self):
        return len(self.buffer)

# DQN hyperparameters
state_size = 4
action_size = 2
learning_rate = 0.001
discount_factor = 0.99
exploration_rate = 1.0
exploration_decay = 0.995
exploration_min = 0.01
batch_size = 64
replay_buffer_capacity = 10000
update_target_frequency = 1000

# Create the environment
env = gym.make('CartPole-v1')

# Initialize the policy and target networks
policy_net = DQN(state_size, action_size)
target_net = DQN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())

# Optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Experience replay buffer
replay_buffer = ReplayBuffer(replay_buffer_capacity)

# Training parameters
episodes = 10000
max_steps = 500
step_count = 0

# DQN training loop
for episode in range(episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        # Epsilon-greedy action selection
        if np.random.uniform(0, 1) < exploration_rate:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = torch.argmax(policy_net(state_tensor)).item()
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        
        # Store the transition
        replay_buffer.add(state, action, reward, next_state, done)
        
        # Train on a sampled mini-batch
        if len(replay_buffer) >= batch_size:
            batch = replay_buffer.sample(batch_size)
            states = torch.FloatTensor(np.array([transition[0] for transition in batch]))
            actions = torch.LongTensor([transition[1] for transition in batch])
            rewards = torch.FloatTensor([transition[2] for transition in batch])
            next_states = torch.FloatTensor(np.array([transition[3] for transition in batch]))
            dones = torch.FloatTensor([transition[4] for transition in batch])
            
            # Current Q values for the taken actions
            current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            
            # Target Q values from the target network
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0]
                target_q = rewards + discount_factor * next_q * (1 - dones)
            
            # Compute the loss
            loss = nn.MSELoss()(current_q, target_q)
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            step_count += 1
            
            # Periodically sync the target network
            if step_count % update_target_frequency == 0:
                target_net.load_state_dict(policy_net.state_dict())
        
        state = next_state
        step += 1
    
    # Decay the exploration rate
    exploration_rate = max(exploration_min, exploration_rate * exploration_decay)
    
    if (episode + 1) % 1000 == 0:
        print(f"Episode: {episode+1}, Exploration Rate: {exploration_rate:.4f}, Steps: {step}")

# Evaluate the trained policy
test_episodes = 10
test_steps = 0

for episode in range(test_episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action = torch.argmax(policy_net(state_tensor)).item()
        next_state, reward, done, _, _ = env.step(action)
        state = next_state
        step += 1
    
    test_steps += step
    print(f"Test Episode: {episode+1}, Steps: {step}")

print(f"Average Test Steps: {test_steps / test_episodes}")
env.close()

3.3 The Policy Gradient Algorithm
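
The implementation below is the REINFORCE algorithm: the policy network outputs action probabilities, a full episode is rolled out, the discounted return G_t is computed for every step, and the parameters are updated by minimizing −Σ_t log πθ(a_t | s_t)·G_t, which is equivalent to ascending the policy gradient. Normalizing the returns reduces the variance of the gradient estimate.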

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym

# Policy network
class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x), dim=-1)
        return x

# Training parameters
state_size = 4
action_size = 2
learning_rate = 0.001
discount_factor = 0.99
episodes = 1000
max_steps = 500

# Create the environment
env = gym.make('CartPole-v1')

# Initialize the model
policy_net = PolicyNetwork(state_size, action_size)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# REINFORCE training loop
for episode in range(episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    rewards = []
    log_probs = []
    
    while not done and step < max_steps:
        # Sample an action from the policy
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = policy_net(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        log_prob = torch.log(action_probs[0, action])
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        
        # Store the reward and log probability
        rewards.append(reward)
        log_probs.append(log_prob)
        
        state = next_state
        step += 1
    
    # Compute discounted returns
    discounted_rewards = []
    cumulative_reward = 0
    for reward in reversed(rewards):
        cumulative_reward = reward + discount_factor * cumulative_reward
        discounted_rewards.insert(0, cumulative_reward)
    
    # Normalize the returns
    discounted_rewards = torch.FloatTensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)
    
    # Compute the loss
    loss = 0
    for log_prob, reward in zip(log_probs, discounted_rewards):
        loss -= log_prob * reward
    
    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode+1}, Steps: {step}")

# Evaluate the trained policy
test_episodes = 10
test_steps = 0

for episode in range(test_episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = policy_net(state_tensor)
            action = torch.argmax(action_probs).item()
        next_state, reward, done, _, _ = env.step(action)
        state = next_state
        step += 1
    
    test_steps += step
    print(f"Test Episode: {episode+1}, Steps: {step}")

print(f"Average Test Steps: {test_steps / test_episodes}")
env.close()

4. Performance Comparison

4.1 Performance Comparison of Different RL Algorithms

Algorithm         Typical use case
Q-Learning        Small, discrete state spaces
DQN               Continuous state spaces with discrete actions
Policy Gradient   Continuous action spaces
Actor-Critic      Continuous state and action spaces
PPO               Complex environments that require stable training

4.2 Performance Test Code

import time
import gym

# Measure the performance of different algorithms
def test_algorithm_performance(algorithm_name, test_function):
    start_time = time.time()
    average_steps = test_function()
    end_time = time.time()
    print(f"{algorithm_name} average steps: {average_steps:.2f}")
    print(f"{algorithm_name} elapsed time: {end_time - start_time:.2f}s")
    return average_steps, end_time - start_time

# Q-Learning test
def test_q_learning():
    # Implement the Q-Learning evaluation here
    # ...
    return 200  # example value

# DQN test
def test_dqn():
    # Implement the DQN evaluation here
    # ...
    return 400  # example value

# Policy Gradient test
def test_policy_gradient():
    # Implement the Policy Gradient evaluation here
    # ...
    return 300  # example value

# Run the performance tests
test_algorithm_performance("Q-Learning", test_q_learning)
test_algorithm_performance("DQN", test_dqn)
test_algorithm_performance("Policy Gradient", test_policy_gradient)

5. Advanced Applications

5.1 The Actor-Critic Algorithm
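
The Actor-Critic implementation below combines both ideas: the actor is a policy network updated with a policy gradient, while the critic estimates the state value V(s) and supplies the one-step TD error δ = r + γ·V(s') − V(s). The TD error replaces the Monte-Carlo return as a lower-variance learning signal for the actor and is also squared to form the critic's loss.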

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

# Actor network (policy)
class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x), dim=-1)
        return x

# Critic network (state-value function)
class Critic(nn.Module):
    def __init__(self, state_size):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Training parameters
state_size = 4
action_size = 2
actor_lr = 0.0001
critic_lr = 0.001
discount_factor = 0.99
episodes = 1000
max_steps = 500

# Create the environment
env = gym.make('CartPole-v1')

# Initialize the models
actor = Actor(state_size, action_size)
critic = Critic(state_size)
actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

# Actor-Critic training loop
for episode in range(episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        # Sample an action from the actor
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = actor(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        log_prob = torch.log(action_probs[0, action])
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        
        # Estimate state values with the critic (the bootstrap target is not backpropagated through)
        state_value = critic(state_tensor)
        with torch.no_grad():
            next_state_value = critic(torch.FloatTensor(next_state).unsqueeze(0))
        
        # Compute the TD error
        td_target = reward + discount_factor * next_state_value * (1 - done)
        td_error = td_target - state_value
        
        # Update the critic
        critic_loss = td_error.pow(2).mean()
        critic_optimizer.zero_grad()
        critic_loss.backward(retain_graph=True)
        critic_optimizer.step()
        
        # Update the actor
        actor_loss = -log_prob * td_error.detach()
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()
        
        state = next_state
        step += 1
    
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode+1}, Steps: {step}")

# Evaluate the trained policy
test_episodes = 10
test_steps = 0

for episode in range(test_episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = actor(state_tensor)
            action = torch.argmax(action_probs).item()
        next_state, reward, done, _, _ = env.step(action)
        state = next_state
        step += 1
    
    test_steps += step
    print(f"Test Episode: {episode+1}, Steps: {step}")

print(f"Average Test Steps: {test_steps / test_episodes}")
env.close()

5.2 Proximal Policy Optimization (PPO)
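
PPO improves on the vanilla policy gradient by limiting how far each update can move the policy. It maximizes the clipped surrogate objective min(r_t(θ)·Â_t, clip(r_t(θ), 1 − ε, 1 + ε)·Â_t), where r_t(θ) = πθ(a_t | s_t) / πθ_old(a_t | s_t) is the probability ratio and Â_t is the advantage, estimated here with Generalized Advantage Estimation (GAE). Because updates are clipped, the same rollout can safely be reused for several update epochs, as the code below does.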

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

# PPO actor and critic networks
class PPOActor(nn.Module):
    def __init__(self, state_size, action_size):
        super(PPOActor, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.softmax(self.fc3(x), dim=-1)
        return x

class PPOCritic(nn.Module):
    def __init__(self, state_size):
        super(PPOCritic, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Training parameters
state_size = 4
action_size = 2
actor_lr = 0.0003
critic_lr = 0.001
discount_factor = 0.99
gae_lambda = 0.95
clip_epsilon = 0.2
update_epochs = 4
batch_size = 64
ppo_epochs = 1000
max_steps = 500

# Create the environment
env = gym.make('CartPole-v1')

# Initialize the models
actor = PPOActor(state_size, action_size)
critic = PPOCritic(state_size)
actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

# PPO training loop
for epoch in range(ppo_epochs):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    states = []
    actions = []
    rewards = []
    old_log_probs = []
    values = []
    
    while not done and step < max_steps:
        # Sample an action from the current policy
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = actor(state_tensor)
        action = torch.multinomial(action_probs, 1).item()
        log_prob = torch.log(action_probs[0, action])
        value = critic(state_tensor)
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        
        # Store rollout data
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        old_log_probs.append(log_prob)
        values.append(value)
        
        state = next_state
        step += 1
    
    # Compute GAE advantages and returns (targets are kept out of the autograd graph)
    with torch.no_grad():
        next_state_value = critic(torch.FloatTensor(next_state).unsqueeze(0)).item()
    returns = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + discount_factor * next_state_value * (1 - done) - values[i].item()
        gae = delta + discount_factor * gae_lambda * (1 - done) * gae
        returns.insert(0, gae + values[i].item())
        next_state_value = values[i].item()
        done = False

    # Convert the rollout to tensors (old log probs and values are detached)
    states = torch.FloatTensor(np.array(states))
    actions = torch.LongTensor(actions)
    old_log_probs = torch.stack(old_log_probs).detach()
    returns = torch.FloatTensor(returns)
    values = torch.stack(values).squeeze().detach()

    # Compute normalized advantages
    advantages = returns - values
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    
    # Update the policy for several epochs over the same rollout
    for _ in range(update_epochs):
        # Mini-batch updates
        indices = torch.randperm(len(states))
        for start in range(0, len(states), batch_size):
            end = start + batch_size
            batch_indices = indices[start:end]
            
            batch_states = states[batch_indices]
            batch_actions = actions[batch_indices]
            batch_old_log_probs = old_log_probs[batch_indices]
            batch_returns = returns[batch_indices]
            batch_advantages = advantages[batch_indices]
            
            # New action probabilities and value estimates
            action_probs = actor(batch_states)
            new_log_probs = torch.log(action_probs.gather(1, batch_actions.unsqueeze(1))).squeeze()
            new_values = critic(batch_states).squeeze()
            
            # Probability ratio between the new and old policies
            ratio = torch.exp(new_log_probs - batch_old_log_probs)
            
            # Clipped PPO surrogate loss
            surr1 = ratio * batch_advantages
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * batch_advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            
            # Critic loss
            critic_loss = nn.MSELoss()(new_values, batch_returns)
            
            # Backpropagation
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()
            
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()
    
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch+1}, Steps: {step}")

# Evaluate the trained policy
test_episodes = 10
test_steps = 0

for episode in range(test_episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    
    while not done and step < max_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = actor(state_tensor)
            action = torch.argmax(action_probs).item()
        next_state, reward, done, _, _ = env.step(action)
        state = next_state
        step += 1
    
    test_steps += step
    print(f"Test Episode: {episode+1}, Steps: {step}")

print(f"Average Test Steps: {test_steps / test_episodes}")
env.close()

5.3 Applying Reinforcement Learning to Games
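
This example applies the same DQN recipe from Section 3.2 to the LunarLander-v2 environment: the state is now 8-dimensional, there are 4 discrete actions, the replay buffer is larger, and the episode return (rather than the number of survived steps) is tracked, since in LunarLander the accumulated reward is what measures landing quality.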

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

# Game environment setup
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# DQN model
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Experience replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    
    def __len__(self):
        return len(self.buffer)

# Hyperparameters
learning_rate = 0.0005
discount_factor = 0.99
exploration_rate = 1.0
exploration_decay = 0.995
exploration_min = 0.01
batch_size = 64
replay_buffer_capacity = 100000
update_target_frequency = 1000

# Initialize the policy and target networks
policy_net = DQN(state_size, action_size)
target_net = DQN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())

# Optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Experience replay buffer
replay_buffer = ReplayBuffer(replay_buffer_capacity)

# Training parameters
episodes = 1000
max_steps = 1000
step_count = 0

# DQN training loop
for episode in range(episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    total_reward = 0
    
    while not done and step < max_steps:
        # Epsilon-greedy action selection
        if np.random.uniform(0, 1) < exploration_rate:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = torch.argmax(policy_net(state_tensor)).item()
        
        # Take the action
        next_state, reward, done, _, _ = env.step(action)
        
        # Store the transition
        replay_buffer.add(state, action, reward, next_state, done)
        total_reward += reward
        
        # Train on a sampled mini-batch
        if len(replay_buffer) >= batch_size:
            batch = replay_buffer.sample(batch_size)
            states = torch.FloatTensor(np.array([transition[0] for transition in batch]))
            actions = torch.LongTensor([transition[1] for transition in batch])
            rewards = torch.FloatTensor([transition[2] for transition in batch])
            next_states = torch.FloatTensor(np.array([transition[3] for transition in batch]))
            dones = torch.FloatTensor([transition[4] for transition in batch])
            
            # Current Q values for the taken actions
            current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            
            # Target Q values from the target network
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0]
                target_q = rewards + discount_factor * next_q * (1 - dones)
            
            # Compute the loss
            loss = nn.MSELoss()(current_q, target_q)
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            step_count += 1
            
            # Periodically sync the target network
            if step_count % update_target_frequency == 0:
                target_net.load_state_dict(policy_net.state_dict())
        
        state = next_state
        step += 1
    
    # Decay the exploration rate
    exploration_rate = max(exploration_min, exploration_rate * exploration_decay)
    
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode+1}, Exploration Rate: {exploration_rate:.4f}, Steps: {step}, Total Reward: {total_reward:.2f}")

# Evaluate the trained policy
test_episodes = 10
test_rewards = 0

for episode in range(test_episodes):
    state = env.reset()
    state = state[0]
    done = False
    step = 0
    total_reward = 0
    
    while not done and step < max_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action = torch.argmax(policy_net(state_tensor)).item()
        next_state, reward, done, _, _ = env.step(action)
        state = next_state
        total_reward += reward
        step += 1
    
    test_rewards += total_reward
    print(f"Test Episode: {episode+1}, Steps: {step}, Total Reward: {total_reward:.2f}")

print(f"Average Test Reward: {test_rewards / test_episodes:.2f}")
env.close()

6. Best Practices

  1. Choose the right algorithm: match the algorithm to the environment's state and action spaces
  2. Tune hyperparameters: use grid search or random search to optimize the hyperparameters (see the sketch after this list)
  3. Experience replay: use a replay buffer to improve sample efficiency
  4. Target networks: use a target network to improve training stability
  5. Exploration strategy: balance exploration and exploitation with epsilon-greedy or other strategies
  6. Reward design: design a sensible reward function to guide the agent's learning
  7. Environment modeling: for complex environments, consider a learned environment model to improve learning efficiency
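
As a minimal illustration of point 2, the sketch below runs a simple grid search over the learning rate and discount factor. Here train_and_evaluate is a hypothetical placeholder that would wrap one of the training loops above and return the average evaluation return for the given setting; the random value it returns is only a stand-in.

from itertools import product
import random

# Hypothetical helper: train an agent with the given hyperparameters and
# return its average evaluation return. Replace the random placeholder with
# a real training-plus-evaluation run.
def train_and_evaluate(learning_rate, discount_factor):
    return random.uniform(0, 500)

learning_rates = [1e-3, 5e-4, 1e-4]
discount_factors = [0.95, 0.99]

best_score, best_config = float('-inf'), None
for lr, gamma in product(learning_rates, discount_factors):
    score = train_and_evaluate(lr, gamma)
    print(f"lr={lr}, gamma={gamma} -> average return {score:.1f}")
    if score > best_score:
        best_score, best_config = score, (lr, gamma)

print(f"Best configuration: lr={best_config[0]}, gamma={best_config[1]} (return {best_score:.1f})")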

7. Common Pitfalls

  1. Sparse rewards: sparse environment rewards can make learning difficult
  2. Insufficient exploration: too much exploitation can trap the agent in a local optimum
  3. Unstable training: some algorithms (such as vanilla Policy Gradient) can be unstable during training
  4. Low sample efficiency: reinforcement learning algorithms usually need large numbers of samples to converge
  5. Hyperparameter sensitivity: algorithm performance is very sensitive to hyperparameter settings
  6. Overfitting to the environment: the agent may over-adapt to the training environment and perform poorly in new ones
  7. Compute requirements: deep reinforcement learning algorithms usually demand substantial computational resources

8. Conclusion

Reinforcement learning is an important branch of machine learning that learns an optimal policy through interaction with the environment. Deep reinforcement learning, which combines deep learning with reinforcement learning, has produced a series of breakthroughs such as AlphaGo and robot control.

Starting from the underlying principles, this article has introduced the core concepts of reinforcement learning, algorithm implementations, and application scenarios. The code examples and performance analysis illustrate the characteristics and typical use cases of the different algorithms.

In practice, the algorithm should be chosen according to the specific problem and tuned appropriately, while keeping an eye on common pitfalls such as sparse rewards and insufficient exploration.

As reinforcement learning continues to evolve, new algorithms and methods such as PPO and SAC keep emerging. By mastering the core principles and best practices, we can build more capable and efficient reinforcement learning systems and apply them to a wider range of domains.

Reinforcement learning will remain an important research direction in artificial intelligence, particularly in robot control, game AI, and autonomous driving. Continued study and practice will keep expanding its performance and range of applications.
