diff --git a/main.ipynb b/main.ipynb index 4e13d9c..792eacd 100644 --- a/main.ipynb +++ b/main.ipynb @@ -95,6 +95,7 @@ "metadata": {}, "outputs": [], "source": [ + "from multiprocessing import Pool\n", "\n", "%load_ext blackcellmagic" ] @@ -152,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +163,8 @@ "EXAMPLE_STACK_SIZE: Final[int] = 1000 # defines the game stack size for examples\n", "IMPOSSIBLE: Final[np.ndarray] = np.array([-1, -1], dtype=int)\n", "IMPOSSIBLE.setflags(write=False)\n", - "SIMULATE_TURNS: Final[int] = 70" + "SIMULATE_TURNS: Final[int] = 70\n", + "VERIFY_POLICY: Final[bool] = True" ] }, { @@ -454,22 +456,22 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "9.43 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", - "1 s ± 179 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "9.31 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "831 ms ± 25.2 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] }, { "data": { "text/plain": "array([[[False, False, False, False, False, False, False, False],\n [False, False, False, False, False, False, False, False],\n [False, False, False, True, False, False, False, False],\n [False, False, True, False, False, False, False, False],\n [False, False, False, False, False, True, False, False],\n [False, False, False, False, True, False, False, False],\n [False, False, False, False, False, False, False, False],\n [False, False, False, False, False, False, False, False]]])" }, - "execution_count": 23, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -593,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -664,15 +666,15 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "177 µs ± 3.97 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", - "29.7 µs ± 106 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", - "31.2 µs ± 269 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + "172 µs ± 7.68 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "29.9 µs ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "31.6 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], @@ -748,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -760,13 +762,13 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "95.1 ms ± 3.5 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + "89.4 ms ± 3.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] }, { @@ -874,7 +876,7 @@ "## An abstract reversi game policy\n", "\n", "For an easy use of policies an abstract class containing the policy generation / requests an action in an inherited instance of this class.\n", - "This class filters the policy to only propose valid actions. Inherited instance do not need to care about this." + "This class filters the policy to only propose valid actions. Inherited instances do not need to care about this. This superclass also manages exploration and exploitation with the epsilon value." ], "metadata": { "collapsed": false @@ -882,7 +884,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -891,6 +893,20 @@ " A game policy. Proposes where to place a stone next.\n", " \"\"\"\n", "\n", + " def __init__(self, epsilon: float):\n", + " \"\"\"\n", + "\n", + " Args:\n", + " epsilon: the epsilon / greedy value. Should be between zero and one. Set the mixture of policy and exploration. One means only the policy is used. Zero means only random policies are used. 
All mixtures inbetween between are possible.\n", + " \"\"\"\n", + " if 0 > epsilon > 1:\n", + " raise ValueError(\"Epsilon should be between zero and one.\")\n", + " self._epsilon: float = epsilon\n", + "\n", + " @property\n", + " def epsilon(self):\n", + " return self._epsilon\n", + "\n", " @property\n", " @abc.abstractmethod\n", " def policy_name(self) -> str:\n", @@ -909,39 +925,179 @@ " \"\"\"\n", " raise NotImplementedError()\n", "\n", - " def get_policy(\n", - " self, boards: np.ndarray, epsilon: float = 1\n", - " ) -> tuple[np.ndarray, np.ndarray]:\n", + " def get_policy(self, boards: np.ndarray) -> np.ndarray:\n", + " \"\"\"Calculates the policy that should be followed.\n", + "\n", + " Calculates the policy that should be followed.\n", + " This function does include the usage of epsilon to configure greediness and exploration.\n", + "\n", + " Args:\n", + " boards: A set of boards that show the environment where the policy should be calculated for.\n", + "\n", + " Returns:\n", + " A vector of indices. Should be formatted as an array of the form [x, y]. 
The value [-1, -1] is expected if no turn is possible.\n", + " \"\"\"\n", " assert len(boards.shape) == 3\n", - " assert boards.shape == (BOARD_SIZE, BOARD_SIZE)\n", + " assert boards.shape[1:] == (BOARD_SIZE, BOARD_SIZE)\n", "\n", - " # todo possibly change this function to only validate the purpose turn and\n", + " if self.epsilon <= 0:\n", + " policies = np.random.rand(*boards.shape)\n", + " else:\n", + " policies = self._internal_policy(boards)\n", + " if self.epsilon < 1:\n", + " policies = policies * self.epsilon + np.random.rand(*boards.shape) * (\n", + " 1 - self.epsilon\n", + " )\n", "\n", - " policies = self._internal_policy(boards)\n", - " raw_policy = policies.copy()\n", - " if epsilon < 1:\n", - " policies = policies + np.random.rand(*boards.shape)\n", - "\n", - " # todo talk to team about backpropagation epsilon for greedy factor\n", + " # todo talk to team about backpropagation of score and epsilon for greedy factor\n", "\n", + " # todo possibly change this function to only validate the purpose turn and not all turns\n", " possible_turns = get_possible_turns(boards)\n", " policies[possible_turns == False] = -1.0\n", " max_indices = [\n", " np.unravel_index(policy.argmax(), policy.shape) for policy in policies\n", " ]\n", " policy_vector = np.array(max_indices)\n", - " max_policy = policy_vector\n", + " no_turn_possible_1 = np.all(policy_vector == 0, 1)\n", + " zero_pos = policies[:, 0, 0] == -1.0\n", " no_turn_possible = np.all(policy_vector == 0, 1) & (policies[:, 0, 0] == -1.0)\n", "\n", - " policy_vector[no_turn_possible] = IMPOSSIBLE\n", - " max_policy[no_turn_possible] = 0\n", - " return policy_vector, raw_policy" + " policy_vector[no_turn_possible, :] = IMPOSSIBLE\n", + " return policy_vector" ] }, { "cell_type": "markdown", "source": [ - "## A first policy" + "## A first policy\n", + "\n", + "To quantify the quality of a game AI there needs to be some benchmarks.\n", + "The easiest benchmark is to play against a random player.\n", + "The 
easiest player to use as a benchmark is the random player.\n", + "For this and for testing purposes, the random policy was implemented." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [], + "source": [ + "class RandomPolicy(GamePolicy):\n", + " \"\"\"\n", + " A policy playing a random turn by setting epsilon to 0.\n", + " \"\"\"\n", + "\n", + " def __init__(self, epsilon: float):\n", + " _ = epsilon\n", + " super().__init__(epsilon=0)\n", + "\n", + " @property\n", + " def policy_name(self) -> str:\n", + " return \"random\"\n", + "\n", + " def _internal_policy(self, boards: np.ndarray) -> np.ndarray:\n", + " pass\n", + "\n", + "\n", + "rnd_policy = RandomPolicy(1)\n", + "assert rnd_policy.policy_name == \"random\"\n", + "assert rnd_policy.epsilon == 0\n", + "\n", + "rnd_policy_result = rnd_policy.get_policy(get_new_games(10))\n", + "assert np.any((5 >= rnd_policy_result) & (rnd_policy_result >= 3))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Putting the game simulation together\n", + "Now it's time to bring all together for a proper simulation." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Playing a single turn\n", + "\n", + "The next function needed is used to request a policy, verify that the turn is legit and place a stone and turn enemy stones if possible." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.02 s ± 58.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "949 ms ± 43.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def single_turn(\n", + " current_boards: np, policy: GamePolicy\n", + ") -> tuple[np.ndarray, np.ndarray]:\n", + " \"\"\"Execute a single turn on a board.\n", + "\n", + " Places a new stone on the board. Turns captured enemy stones.\n", + "\n", + " Args:\n", + " current_boards: The current board before the game.\n", + " policy: The game policy to be used.\n", + "\n", + " Returns:\n", + " The new game board and the policy vector containing the index of the action used.\n", + " \"\"\"\n", + " policy_results = policy.get_policy(current_boards)\n", + "\n", + " # if the constant VERIFY_POLICY is set to true the policy is verified. Should be good though.\n", + " # todo deactivate the policy verification after some testing.\n", + " if VERIFY_POLICY:\n", + " assert np.all(moves_possible(current_boards, policy_results)), (\n", + " current_boards[(moves_possible(current_boards, policy_results) == False)],\n", + " policy_results[(moves_possible(current_boards, policy_results) == False)],\n", + " np.where(moves_possible(current_boards, policy_results) == False),\n", + " )\n", + " return do_moves(current_boards, policy_results), policy_results\n", + "\n", + "\n", + "%timeit single_turn(get_new_games(EXAMPLE_STACK_SIZE), RandomPolicy(1))\n", + "VERIFY_POLICY = False # type: ignore\n", + "%timeit single_turn(get_new_games(EXAMPLE_STACK_SIZE), RandomPolicy(1))\n", + "VERIFY_POLICY = True # type: ignore\n", + "plot_othello_boards(\n", + " single_turn(get_new_games(EXAMPLE_STACK_SIZE), RandomPolicy(1))[0][:8]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Simulate a stack of games\n", + "This function will simulate a stack of games and return an array of policies and histories." 
], "metadata": { "collapsed": false @@ -951,63 +1107,58 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception in thread Thread-5 (_handle_workers):\n", + "Traceback (most recent call last):\n", + " File \"C:\\Program Files\\Python310\\lib\\threading.py\", line 1016, in _bootstrap_inner\n", + " self.run()\n", + " File \"C:\\Program Files\\Python310\\lib\\threading.py\", line 953, in run\n", + " self._target(*self._args, **self._kwargs)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\pool.py\", line 516, in _handle_workers\n", + " cls._maintain_pool(ctx, Process, processes, pool, inqueue,\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\pool.py\", line 340, in _maintain_pool\n", + " Pool._repopulate_pool_static(ctx, Process, processes, pool,\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\pool.py\", line 329, in _repopulate_pool_static\n", + " w.start()\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\process.py\", line 121, in start\n", + " self._popen = self._Popen(self)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\context.py\", line 336, in _Popen\n", + " return Popen(process_obj)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\popen_spawn_win32.py\", line 93, in __init__\n", + " reduction.dump(process_obj, to_child)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\reduction.py\", line 60, in dump\n", + " ForkingPickler(file, protocol).dump(obj)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\synchronize.py\", line 104, in __getstate__\n", + " h = context.get_spawning_popen().duplicate_for_child(sl.handle)\n", + " File \"C:\\Program Files\\Python310\\lib\\multiprocessing\\popen_spawn_win32.py\", line 99, in duplicate_for_child\n", + " return reduction.duplicate(handle, self.sentinel)\n", + " File 
\"C:\\Program Files\\Python310\\lib\\multiprocessing\\reduction.py\", line 79, in duplicate\n", + " return _winapi.DuplicateHandle(\n", + "PermissionError: [WinError 5] Zugriff verweigert\n" + ] + } + ], "source": [ - "class RandomPolicy(GamePolicy):\n", - " @property\n", - " def policy_name(self) -> str:\n", - " return \"random\"\n", - "\n", - " def internal_policy(self, boards: np.ndarray) -> np.ndarray:\n", - " random_values = np.random.rand(*boards.shape)\n", - " return random_values\n", - " # return np.argmax(random_values, (1, 2))\n", - "\n", - "\n", - "rnd_policy = RandomPolicy()\n", - "assert rnd_policy.policy_name == \"random\"\n", - "rnd_policy_result = rnd_policy.get_policy(get_new_games(1))\n", - "assert np.any((5 >= rnd_policy_result) & (rnd_policy_result >= 3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def single_turn(\n", - " current_boards: np, policy: GamePolicy\n", - ") -> tuple[np.ndarray, np.ndarray]:\n", - " policy_results = policy.get_policy(current_boards)\n", - "\n", - " assert np.all(moves_possible(current_boards, policy_results)), (\n", - " current_boards[(moves_possible(current_boards, policy_results) == False)],\n", - " policy_results[(moves_possible(current_boards, policy_results) == False)],\n", - " np.where(moves_possible(current_boards, policy_results) == False),\n", - " )\n", - "\n", - " return do_moves(current_boards, policy_results), policy_results\n", - "\n", - "\n", - "%timeit single_turn(get_new_games(100), RandomPolicy())\n", - "single_turn(get_new_games(100), RandomPolicy())[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", + "from tqdm.notebook import tqdm\n", "\n", "\n", "def simulate_game(\n", " nr_of_games: int,\n", " policies: tuple[GamePolicy, GamePolicy],\n", ") -> tuple[np.ndarray, np.ndarray]:\n", + " \"\"\"Simulates a stack of games.\n", "\n", + " Args:\n", + " 
nr_of_games: The number of games that should be simulated.\n", + " policies: The policies that should be used to simulate the game.\n", + "\n", + " Returns:\n", + " A stack of board histories and actions.\n", + " \"\"\"\n", " board_history_stack = np.zeros((SIMULATE_TURNS, nr_of_games, 8, 8))\n", " action_history_stack = np.zeros((SIMULATE_TURNS, nr_of_games, 2))\n", " current_boards = get_new_games(nr_of_games)\n", @@ -1026,21 +1177,88 @@ " return board_history_stack, action_history_stack\n", "\n", "\n", - "%timeit simulate_game(100, (RandomPolicy(), RandomPolicy()))\n", - "simulate_game(10, (RandomPolicy(), RandomPolicy()))" + "simulation_results = simulate_game(1, (RandomPolicy(1), RandomPolicy(1)))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "%timeit simulate_game(100, (RandomPolicy(1), RandomPolicy(1)))\n", + "# simulate_game(EXAMPLE_STACK_SIZE, (RandomPolicy(1), RandomPolicy(1)))" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "outputs": [], + "source": [ + "policies_to_use = RandomPolicy(1), RandomPolicy(1)\n", + "with Pool(3) as pool:\n", + " results = pool.map(simulate_game, [100, policies_to_use])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "a = np.array(\n", + " [\n", + " [\n", + " [-1, -1, -1, -1, 0, 0, 0, 0],\n", + " [1, 1, -1, 1, 1, 0, 0, 0],\n", + " [1, 1, -1, 1, 1, 1, 0, 0],\n", + " [0, 1, -1, 1, 1, 1, 0, 0],\n", + " [0, 1, 1, 1, 1, 1, 0, 0],\n", + " [-1, 1, 1, 1, 1, 0, 0, 0],\n", + " [0, 0, 0, 1, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0],\n", + " ]\n", + " ],\n", + " dtype=int,\n", + ")\n", + "a" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "RandomPolicy(1).get_policy(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": true + } + }, "outputs": [], "source": [ "import numpy as np\n", @@ -1291,7 +1509,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": true + } + }, "outputs": [], "source": [ "plot_othello_boards(create_test_game()[-3:])" @@ -1300,7 +1522,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": true + } + }, "outputs": [], "source": [ "array = create_test_game()" @@ -1321,7 +1547,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": true + } + }, "outputs": [], "source": [] }