diff --git a/main.ipynb b/main.ipynb index ed19025..f0a9cf1 100644 --- a/main.ipynb +++ b/main.ipynb @@ -8,10 +8,10 @@ "\n", "The game reversi is a very good game to apply deep learning methods to.\n", "\n", - "Othello also known as reversi is a board game first published in 1883 by eiter Lewis Waterman or John W. Mollet in England (each one was denouncing the other as fraud).\n", - "It is a strickt turn based zero-sum game with a clear Markov chain and now hidden states like in card games with an unknown distribution of cards or unknown player allegiance.\n", - "There is like for the game go only one set of stones with two colors which is much easier to abstract than chess with its 6 unique pieces.\n", - "The game has a symmetrical game board wich allows to play with rotating the state around an axis to allow for a breaking of sequences or interesting ANN architectures, quadruple the data generation by simulation or interesting test cases where a symetry in turns should be observable if the AI reaches an \"objective\" policy." + "Othello, also known as reversi, is a board game first published in 1883 by either Lewis Waterman or John W. Mollet in England (each one was denouncing the other as fraud).\n", + "It is a strict turn-based zero-sum game with a clear Markov chain and no hidden states, unlike card games with an unknown distribution of cards or unknown player allegiance.\n", + "The game is played with one set of stones with two colors, which is much easier to abstract than chess with its 6 unique pieces.\n", + "The game board is symmetrical and allows for playing with rotating the state around an axis or flipping/mirroring the board, which can allow for a breaking of sequences or interesting ANN architectures, quadruple the data generation by simulation, or interesting test cases where symmetry in turns should be observable if the AI reaches an \"objective\" policy." 
] }, { @@ -717,9 +717,11 @@ "\n", "\n", "@np_cache(maxsize=2000, array_argument_elements=(0, 1))\n", - "def _get_possible_turns_for_board(board: np.ndarray, poss_turns: np.ndarray) -> np.ndarray:\n", + "def _get_possible_turns_for_board(\n", + " board: np.ndarray, poss_turns: np.ndarray\n", + ") -> np.ndarray:\n", " \"\"\"Calcualtes where turns are possible.\n", - " \n", + "\n", " Args:\n", " board: The board that should be checked for a playable action.\n", " poss_turns: An array of actions that could be possible. All true fileds are empty and next to an enemy stone.\n", @@ -1681,15 +1683,17 @@ " )\n", " ax2.scatter(turn, mean_possibility_count[turn], marker=\"x\")\n", " ax2.legend()\n", - " \n", + "\n", " action_space_cumprod = np.cumprod(_mean_possibility_count[::-1], axis=0)[::-1]\n", " ax4.plot(range(70), action_space_cumprod)\n", - " \n", + "\n", " ax4.scatter(turn, action_space_cumprod[turn], marker=\"x\")\n", " ax4.set_yscale(\"log\", base=10)\n", " ax4.set_xlabel(\"Turn\")\n", " ax4.set_ylabel(\"Mean remaining total action space size\")\n", - " ax4.set_title(f\"Remaining action space at {turn} = {action_space_cumprod[turn].round():.2E}\")\n", + " ax4.set_title(\n", + " f\"Remaining action space at {turn} = {action_space_cumprod[turn].round():.2E}\"\n", + " )\n", " fig.delaxes(ax3)\n", " fig.tight_layout()\n", " plt.show()" @@ -1870,7 +1874,7 @@ "source": [ "def history_changed(board_history: np.ndarray) -> np.ndarray:\n", " \"\"\"Calculates if the board changed between actions.\n", - " \n", + "\n", " Args:\n", " board_history: A history of game baords. 
Shaped (70 * n * 8 * 8)\n", " \"\"\"\n", @@ -1925,10 +1929,8 @@ " assert len(board_history.shape) == 4\n", " assert board_history.shape[-2:] == (8, 8)\n", " assert board_history.shape[0] == SIMULATE_TURNS\n", - " return (\n", - " pd.Series(\n", - " [count_unique_boards(board_history[turn]) for turn in range(SIMULATE_TURNS)]\n", - " )\n", + " return pd.Series(\n", + " [count_unique_boards(board_history[turn]) for turn in range(SIMULATE_TURNS)]\n", " )\n", "\n", "\n", @@ -2039,16 +2041,27 @@ " score[player_2_won] = -score2_final\n", " return score\n", "\n", + "\n", "np.random.seed(2)\n", "_baords = simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0]\n", - "np.testing.assert_array_equal(np.sum(_baords[-1], axis=(1,2)), final_boards_evaluation(_baords[-1]))\n", + "np.testing.assert_array_equal(\n", + " np.sum(_baords[-1], axis=(1, 2)), final_boards_evaluation(_baords[-1])\n", + ")\n", "np.random.seed(2)\n", - "np.testing.assert_array_equal(np.array([ -6., -36., -12., -16., 38., -12., 2., -22., 2., 10.]), final_boards_evaluation(simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0][-1]))\n", + "np.testing.assert_array_equal(\n", + " np.array([-6.0, -36.0, -12.0, -16.0, 38.0, -12.0, 2.0, -22.0, 2.0, 10.0]),\n", + " final_boards_evaluation(\n", + " simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0][-1]\n", + " ),\n", + ")\n", "\n", "np.random.seed(2)\n", "boards = simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0][-1]\n", "boards[:, 4, :] = 0\n", - "np.testing.assert_array_equal(np.array([-14., -38., -14., -22., 40., -16., -14., -28., 0., 20.]), final_boards_evaluation(boards))\n", + "np.testing.assert_array_equal(\n", + " np.array([-14.0, -38.0, -14.0, -22.0, 40.0, -16.0, -14.0, -28.0, 0.0, 20.0]),\n", + " final_boards_evaluation(boards),\n", + ")\n", "\n", "_boards = get_new_games(EXAMPLE_STACK_SIZE)\n", "%timeit final_boards_evaluation(_boards)" @@ -2064,16 +2077,20 @@ "source": [ "def calculate_final_evaluation_for_history(board_history: 
np.ndarray) -> np.ndarray:\n", " \"\"\"Calculates the final scores for a stack of game histories.\n", - " \n", + "\n", " Args:\n", " board_history: A stack of game histories.\n", " \"\"\"\n", " final_evaluation = final_boards_evaluation(board_history[-1])\n", " return final_evaluation / 64\n", "\n", + "\n", "np.random.seed(2)\n", "_boards = simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0]\n", - "np.testing.assert_array_equal(np.array([ -6., -36., -12., -16., 38., -12., 2., -22., 2., 10.]) / 64, calculate_final_evaluation_for_history(_boards))" + "np.testing.assert_array_equal(\n", + " np.array([-6.0, -36.0, -12.0, -16.0, 38.0, -12.0, 2.0, -22.0, 2.0, 10.0]) / 64,\n", + " calculate_final_evaluation_for_history(_boards),\n", + ")" ] }, { @@ -2140,7 +2157,7 @@ " Returns:\n", " the combined score for both players.\n", " \"\"\"\n", - " assert boards.shape[-2:] == (8,8)\n", + " assert boards.shape[-2:] == (8, 8)\n", " return np.sum(boards, axis=(-1, -2))\n", "\n", "\n", @@ -2159,10 +2176,10 @@ " (70, 10),\n", ")\n", "np.random.seed(3)\n", - "np.testing.assert_array_equal(evaluate_boards(simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0][:4, :3]), np.array([[0, 0, 0],\n", - " [3, 3, 3],\n", - " [0, 0, 0],\n", - " [5, 3, 3]]))\n", + "np.testing.assert_array_equal(\n", + " evaluate_boards(simulate_game(10, (RandomPolicy(1), RandomPolicy(1)))[0][:4, :3]),\n", + " np.array([[0, 0, 0], [3, 3, 3], [0, 0, 0], [5, 3, 3]]),\n", + ")\n", "\n", "_boards = get_new_games(EXAMPLE_STACK_SIZE)\n", "%timeit evaluate_boards(_boards)" @@ -2273,7 +2290,8 @@ "plt.title(\"Win distribution\")\n", "plt.bar(\n", " [\"black\", \"draw\", \"white\"],\n", - " pd.Series(calculate_who_won(_board_history)).value_counts().sort_index() / _board_history.shape[1],\n", + " pd.Series(calculate_who_won(_board_history)).value_counts().sort_index()\n", + " / _board_history.shape[1],\n", ")\n", "plt.show()" ] @@ -2298,8 +2316,8 @@ "outputs": [], "source": [ "def calculate_direct_score(board_history: 
np.ndarray) -> np.ndarray:\n", - " \"\"\"Calcualtes the delta score for all actions.\n", - " \n", + " \"\"\"Calculates the delta score for all actions.\n", + "\n", " Args:\n", " board_history: A history of board games or a stack of board games. Shaped (70 * n * 8 * 8)\n", " \"\"\"\n", @@ -2329,7 +2347,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When ploting the direct score it can be easily seen that the later turnse are point whise more importent. A bad opening however will not alow the player to keep those points. But it is easy to see that points not made at the begining of the game can be made at the end of the game. This allows for concentration on the gameplay and some preperation at the start of the game." + "When plotting the direct score it can be easily seen that the later turns are point-wise more important. A bad opening, however, will not allow the player to keep those points. But it is easy to see that points not made at the beginning of the game can be made at the end of the game. This allows for concentration on the gameplay and some preparation at the start of the game." ] }, { @@ -2372,7 +2390,7 @@ " f\"Histogram of scores changes on turn {turn} by {'white' if turn % 2 == 0 else 'black'}\"\n", " )\n", " score = score_history[turn]\n", - " bins = max(1, int(max(score) - min(score)) )\n", + " bins = max(1, int(max(score) - min(score)))\n", " ax1.hist(score, density=True, bins=bins)\n", " ax1.set_xlabel(\"Points made\")\n", " ax1.set_ylabel(\"Score probability\")\n", @@ -2392,47 +2410,30 @@ " plt.show()" ] }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": { - "pycharm": { - "is_executing": true - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Curating Q-Learing requirements\n", + "## Creating Q-Learning Policies\n", + "Q-learning is a classic reinforcement learning technique. The Q-function is an action-value function that returns the expected value of an action in a given state.\n", + "\n", + "$Q^\\pi(s_t,a_t)=\\sum^{60}_{t=turn}\\gamma^{60-t} \\cdot R_t$\n", + "\n", + "With this function, all actions in a given state can be evaluated, and the most beneficial action can be taken. With classical reinforcement learning, a table for situations and actions is explored and slowly filled. With ANNs, there is the possibility to use an AI model that can interpolate between situations and should not need to explore the complete game tree to solve some situations.\n", + "\n", "### Calculating discount tables\n", "\n", - "Since the a game stack is contains all steps even if no action is possible this needs to be corrected.\n", - "The normal formula for a reword is:\n", + "Since the game stack contains all steps, even if no action is possible, this needs to be corrected. 
The normal formula for a reward is:\n", "\n", - "$E(s_{turn},a_{turn}) = \\prod_{t=turn}^{70}\\gamma_t$\n", + "$E(s_{turn},a_{turn}) = \\sum^{60}_{t=turn}\\gamma^{60-t} \\cdot R_t$\n", "\n", - "Since turns that can't be taken do not have the element of uncertanty the discountation has to be excluded by setting the value to $1$ instead of $\\gamma$.\n", + "Since turns that can't be taken do not have the element of uncertainty, the discounting has to be excluded by setting the value to $1$ instead of $\\gamma$.\n", "\n", - "$q_t =\\begin{cases}1 & |a_t|=0\\\\\\gamma & |a_t|>0\\end{cases}$\n", + "$\\gamma^*_t =\\begin{cases}1 & |a_t|=0\\\\\\gamma & |a_t|>0\\end{cases}$\n", "\n", - "$E(s_{turn},a_{turn}) = \\prod_{t=turn}^{70}q_t$\n", + "$E(s_{turn},a_{turn}) = \\prod_{t=turn}^{70}\\gamma^*_t \\cdot R_t$\n", "\n", - "The table below contains the aggregated discount factors for each reword fitting to the state history." + "The table below contains the aggregated discount factors ($\\prod_{t=turn}^{70}\\gamma^*_t$) for each reward fitting to the state history. This setup also allows rewarding the certainty gained by taking the choice of the action from the opponent. It can be argued that also all turns where a player had no choice how to act should not be discounted. But this will increase calculation requirements to nearly double, which is currently not acceptable since computation time and code complexity are bottlenecks." ] }, { @@ -2447,7 +2448,7 @@ "source": [ "def get_gamma_table(board_history: np.ndarray, gamma_value: float) -> np.ndarray:\n", " \"\"\"Calculates a discount table for a board history.\n", - " \n", + "\n", " Args:\n", " board_history: A history of game boards. 
Shaped (70 * n * 8 * 8)\n", " gamma_value: The default discount factor.\n", @@ -2505,14 +2506,14 @@ "def calculate_q_reword(\n", " board_history: np.ndarray,\n", " who_won_fraction: float = 0.2,\n", - " final_score_fraction: float=0.2,\n", - " gamma: float=0.8,\n", + " final_score_fraction: float = 0.2,\n", + " gamma: float = 0.8,\n", ") -> np.ndarray:\n", " \"\"\"\n", - " \n", + "\n", " Args:\n", - " board_history: \n", - " who_won_fraction: \n", + " board_history:\n", + " who_won_fraction:\n", " final_score_fraction:\n", " gamma:\n", " \"\"\"\n", @@ -3761,19 +3762,21 @@ ] }, { - "cell_type": "raw", - "metadata": { - "tags": [] - }, + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "probes: int = 1000\n", "_ = (\n", - " calculate_board_branching(simulate_game(probes, (ql_policy, ql_policy))[0]) / probes\n", + " calculate_board_branching(simulate_game(probes, (ql_policy1, ql_policy1))[0]) / probes\n", ").plot(\n", " ylim=(0, 1),\n", - " title=f\"Branching rate for a QL policy with epsilon={ql_policy.epsilon}\",\n", + " title=f\"Branching rate for a QL policy with epsilon={ql_policy1.epsilon}\",\n", ")" - ] + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", @@ -3815,240 +3818,11 @@ } ], "source": [ + "constant_metric_policies = [RandomPolicy(0), GreedyPolicy(0)]\n", "for i in range(100):\n", " for ql_policy in ql_policys:\n", " ql_policy.load()\n", - " ql_policy.train(1, 10, 1000, 250, [RandomPolicy(0), GreedyPolicy(0)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "ql_policy.load()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "ql_policy.plot_history()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": 
true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "ql_policy.train(100, 10, 1000, 250, [RandomPolicy(0), GreedyPolicy(0)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "_boards_greedy_l, _action_greedy_l = simulate_game(\n", - " 500, (RandomPolicy(0), GreedyPolicy(0)), tqdm_on=True\n", - ")\n", - "_boards_greedy_r, _action_greedy_r = simulate_game(\n", - " 500, (GreedyPolicy(0), RandomPolicy(0)), tqdm_on=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "_boards_greedy_r.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "np.sum(_boards_greedy_l[-1]) / 500, -np.sum(_boards_greedy_r[-1]) / 500" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "ql_policy._epsilon = 1\n", - "_boards_l, _actions_l = simulate_game(500, (RandomPolicy(0), ql_policy), tqdm_on=True)\n", - "_boards_r, _actions_r = simulate_game(500, (ql_policy, RandomPolicy(0)), tqdm_on=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "_boards_l.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "np.sum(_boards_l[-1]) / 500, -np.sum(_boards_r[-1]) / 500" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "_boards_policy_l, _ = simulate_game(500, (ql_policy, GreedyPolicy(0)), tqdm_on=True)\n", - "_boards_policy_r, _ = simulate_game(500, (GreedyPolicy(0), ql_policy), tqdm_on=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "np.sum(_boards_policy_l[-1]) / 500, np.sum(_boards_policy_r[-1]) / 500" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "calculate_final_evaluation_for_history(\n", - " _boards_policy_l\n", - ").mean() * 64, calculate_final_evaluation_for_history(_boards_policy_r).mean() * 64" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "@interact(game_start=\"0\")\n", - "def plot_training(game_start: int) -> None:\n", - " boards_at_once = 12\n", - " if not game_start:\n", - " return\n", - " game_start = int(game_start)\n", - " start = game_start * boards_at_once\n", - " end = start + boards_at_once\n", - " boards_selected = _boards_l[start:end, 0]\n", - " scores_selected = _boards_r[start:end, 0]\n", - "\n", - " # noinspection PyProtectedMember\n", - " p_scores = np.max(\n", - " ql_policy._internal_policy(_boards[start:end, 0].cpu().detach().numpy()),\n", - " axis=(1, 2),\n", - " ).tolist()\n", - "\n", - " scores2 = np.array(\n", - " [\n", - " f\"Q:{float(score[0]):2e}@P:{float(score[1]):2e}\"\n", - " for score in zip(scores_selected, p_scores)\n", - " ]\n", - " )\n", - " plot_othello_boards(\n", - " boards_selected,\n", - " scores=scores2,\n", - " )" + " ql_policy.train(1, 
10, 1000, 250, constant_metric_policies)" ] }, {