The maze is represented by a two-dimensional array, self.maze, where (the exact matrix is shown in the snippet after this list):
- 0 marks an empty cell the agent can walk on,
- 1 marks an obstacle,
- 2 marks the start position of the maze,
- 3 marks the goal position of the maze.
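For concreteness, this is the 5x5 matrix that the full environment below is built around; the snippet only restates the array from the complete implementation and is not a separate API.

import numpy as np

# 0 = empty cell, 1 = obstacle, 2 = start, 3 = goal
maze = np.array([
    [2, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 0, 1, 3],
])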
Actions define the set of moves the agent can take. In this maze problem the actions are moving up, down, left, and right, encoded as the integers 0, 1, 2, and 3 respectively (the mapping to grid offsets is sketched below).
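The index-to-offset mapping is the same dictionary the environment's step() method uses to move the agent on the grid:

# Each action index maps to a (row, column) offset on the grid
action_map = {
    0: (-1, 0),  # up: one row less
    1: (1, 0),   # down: one row more
    2: (0, -1),  # left: one column less
    3: (0, 1),   # right: one column more
}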
Rewards (a short sketch of this rule follows the list):
- Check whether the agent has reached the goal position; if so, set self.done to True and give a positive reward of 1.0 to signal success.
- If the goal has not been reached, give a negative reward of -0.1, meaning the agent still needs to keep searching for the goal.
- If the agent would hit a wall or an obstacle, it stays in place.
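As a minimal sketch of this rule (the helper name compute_reward is hypothetical; the full environment below implements the same logic inline in its step() method):

def compute_reward(state, goal_state):
    # Reaching the goal ends the episode with a reward of 1.0;
    # every other step costs -0.1, which encourages shorter paths.
    if state == goal_state:
        return 1.0, True
    return -0.1, False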
import numpy as np


class SimpleMazeEnv:
    def __init__(self):
        # Define the maze matrix: 0 = empty cell, 1 = obstacle, 2 = start, 3 = goal
        self.maze = np.array([
            [2, 0, 0, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 0, 1, 0],
            [0, 1, 0, 0, 0],
            [0, 0, 0, 1, 3],
        ])
        self.start_state = (0, 0)
        self.goal_state = (4, 4)
        self.state = self.start_state
        self.done = False
        # Define the action space: up, down, left, right
        self.action_space = [0, 1, 2, 3]
        self.action_map = {
            0: (-1, 0),  # up
            1: (1, 0),   # down
            2: (0, -1),  # left
            3: (0, 1),   # right
        }

    def reset(self):
        self.state = self.start_state
        self.done = False
        return self.state

    def step(self, action):
        if self.done:
            raise Exception("Episode has ended. Please reset the environment.")
        # Compute the candidate next state
        move = self.action_map[action]
        new_state = (self.state[0] + move[0], self.state[1] + move[1])
        # Check bounds and obstacles; on a wall or obstacle the agent stays in place
        if (0 <= new_state[0] < self.maze.shape[0]
                and 0 <= new_state[1] < self.maze.shape[1]
                and self.maze[new_state] != 1):
            self.state = new_state
        # Check whether the goal has been reached
        if self.state == self.goal_state:
            self.done = True
            reward = 1.0
        else:
            reward = -0.1  # per-step penalty
        return self.state, reward, self.done

    def render(self):
        # Build a character matrix to display the maze
        maze_visual = np.full(self.maze.shape, ".", dtype=str)  # "." marks an empty cell
        maze_visual[self.maze == 1] = "#"   # "#" marks an obstacle
        maze_visual[self.goal_state] = "G"  # "G" marks the goal position
        maze_visual[self.state] = "A"       # "A" marks the agent's current position
        for row in maze_visual:
            print(" ".join(row))
        print("--------------------------------")


class QLearningAgent:
    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.q_table = np.zeros((env.maze.shape[0], env.maze.shape[1], len(env.action_space)))

    def choose_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.env.action_space)
        return np.argmax(self.q_table[state[0], state[1]])

    def update(self, state, action, reward, next_state):
        # Standard Q-learning temporal-difference update
        best_next_action = np.argmax(self.q_table[next_state[0], next_state[1]])
        td_target = reward + self.gamma * self.q_table[next_state[0], next_state[1], best_next_action]
        td_error = td_target - self.q_table[state[0], state[1], action]
        self.q_table[state[0], state[1], action] += self.alpha * td_error

    def train(self, episodes=1000):
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, done = self.env.step(action)
                self.update(state, action, reward, next_state)
                state = next_state


if __name__ == "__main__":
    # Create the environment and the agent
    env = SimpleMazeEnv()
    agent = QLearningAgent(env)
    # Train the agent
    agent.train(episodes=1000)
    # Test the agent (note: choose_action still explores with probability epsilon)
    state = env.reset()
    done = False
    env.render()
    while not done:
        action = agent.choose_action(state)
        state, _, done = env.step(action)
        env.render()
    print(agent.q_table)
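To inspect what the agent has learned, one option is to read the greedy action out of q_table for every cell. This is a small sketch that assumes training has already run and reuses env, agent, and the numpy import from the listing above; print_policy and the arrow symbols are illustrative additions, not part of the original listing.

arrows = {0: "^", 1: "v", 2: "<", 3: ">"}

def print_policy(env, agent):
    # Show the greedy (argmax) action for each free cell of the maze.
    for i in range(env.maze.shape[0]):
        row = []
        for j in range(env.maze.shape[1]):
            if env.maze[i, j] == 1:
                row.append("#")  # obstacle
            elif (i, j) == env.goal_state:
                row.append("G")  # goal
            else:
                row.append(arrows[int(np.argmax(agent.q_table[i, j]))])
        print(" ".join(row))

# Example: call print_policy(env, agent) after agent.train(episodes=1000)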