
BGonline.org Forums

AlphaZero pseudocode now available

Posted By: Timothy Chow
Date: Sunday, 9 December 2018, at 4:09 a.m.


The AlphaZero team has published their work in Science and has even gone so far as to release some pseudocode (see below). Perhaps some enterprising programmer will now attempt to create AlphaZero backgammon? (A rough sketch of the game-specific pieces that would need filling in appears after the pseudocode.)

 


"""Pseudocode description of the AlphaZero algorithm."""


from __future__ import division

import math
import numpy
import tensorflow as tf
from typing import List

##########################
####### Helpers ##########


class AlphaZeroConfig(object):

  def __init__(self):
    ### Self-Play
    self.num_actors = 5000

    self.num_sampling_moves = 30
    self.max_moves = 512  # for chess and shogi, 722 for Go.
    self.num_simulations = 800

    # Root prior exploration noise.
    self.root_dirichlet_alpha = 0.3  # for chess, 0.03 for Go and 0.15 for shogi.
    self.root_exploration_fraction = 0.25

    # UCB formula
    self.pb_c_base = 19652
    self.pb_c_init = 1.25

    ### Training
    self.training_steps = int(700e3)
    self.checkpoint_interval = int(1e3)
    self.window_size = int(1e6)
    self.batch_size = 4096

    self.weight_decay = 1e-4
    self.momentum = 0.9
    # Schedule for chess and shogi, Go starts at 2e-2 immediately.
    self.learning_rate_schedule = {
        0: 2e-1,
        100e3: 2e-2,
        300e3: 2e-3,
        500e3: 2e-4
    }


class Node(object):

  def __init__(self, prior: float):
    self.visit_count = 0
    self.to_play = -1
    self.prior = prior
    self.value_sum = 0
    self.children = {}

  def expanded(self):
    return len(self.children) > 0

  def value(self):
    if self.visit_count == 0:
      return 0
    return self.value_sum / self.visit_count


class Game(object):

  def __init__(self, history=None):
    self.history = history or []
    self.child_visits = []
    self.num_actions = 4672  # action space size for chess; 11259 for shogi, 362 for Go

  def terminal(self):
    # Game specific termination rules.
    pass

  def terminal_value(self, to_play):
    # Game specific value.
    pass

  def legal_actions(self):
    # Game specific calculation of legal actions.
    return []

  def clone(self):
    return Game(list(self.history))

  def apply(self, action):
    self.history.append(action)

  def store_search_statistics(self, root):
    sum_visits = sum(child.visit_count for child in root.children.values())
    self.child_visits.append([
        root.children[a].visit_count / sum_visits if a in root.children else 0
        for a in range(self.num_actions)
    ])

  def make_image(self, state_index: int):
    # Game specific feature planes.
    return []

  def make_target(self, state_index: int):
    return (self.terminal_value(state_index % 2),
            self.child_visits[state_index])

  def to_play(self):
    return len(self.history) % 2


class ReplayBuffer(object):

  def __init__(self, config: AlphaZeroConfig):
    self.window_size = config.window_size
    self.batch_size = config.batch_size
    self.buffer = []

  def save_game(self, game):
    if len(self.buffer) > self.window_size:
      self.buffer.pop(0)
    self.buffer.append(game)

  def sample_batch(self):
    # Sample uniformly across positions.
    move_sum = float(sum(len(g.history) for g in self.buffer))
    games = numpy.random.choice(
        self.buffer,
        size=self.batch_size,
        p=[len(g.history) / move_sum for g in self.buffer])
    game_pos = [(g, numpy.random.randint(len(g.history))) for g in games]
    return [(g.make_image(i), g.make_target(i)) for (g, i) in game_pos]
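

# (Comment added for clarity; not part of the published pseudocode.) Picking a
# game with probability proportional to its length and then a uniform position
# inside the chosen game makes every stored position equally likely overall,
# which is what "sample uniformly across positions" means here.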


class Network(object):

  def inference(self, image):
    return (-1, {})  # Value, Policy

  def get_weights(self):
    # Returns the weights of this network.
    return []


class SharedStorage(object):

  def __init__(self):
    self._networks = {}

  def latest_network(self) -> Network:
    if self._networks:
      return self._networks[max(self._networks.keys())]
    else:
      return make_uniform_network()  # policy -> uniform, value -> 0.5

  def save_network(self, step: int, network: Network):
    self._networks[step] = network


##### End Helpers ########
##########################


# AlphaZero training is split into two independent parts: Network training and
# self-play data generation.
# These two parts only communicate by transferring the latest network checkpoint
# from the training to the self-play, and the finished games from the self-play
# to the training.
def alphazero(config: AlphaZeroConfig):
  storage = SharedStorage()
  replay_buffer = ReplayBuffer(config)

  for i in range(config.num_actors):
    launch_job(run_selfplay, config, storage, replay_buffer)

  train_network(config, storage, replay_buffer)

  return storage.latest_network()
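

# (Illustration only; not part of the published pseudocode.) The launch_job
# stub at the bottom simply runs each self-play loop inline. One minimal way
# to run the actors concurrently in a single process, while still sharing the
# in-memory SharedStorage and ReplayBuffer, would be ordinary daemon threads:
#
#   import threading
#
#   def launch_job(f, *args):
#     threading.Thread(target=f, args=args, daemon=True).start()
#
# The paper's own setup instead ran the actors as thousands of separate jobs
# (num_actors = 5000 in the config above).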


##################################
####### Part 1: Self-Play ########


# Each self-play job is independent of all others; it takes the latest network
# snapshot, produces a game and makes it available to the training job by
# writing it to a shared replay buffer.
def run_selfplay(config: AlphaZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer):
  while True:
    network = storage.latest_network()
    game = play_game(config, network)
    replay_buffer.save_game(game)


# Each game is produced by starting at the initial board position, then
# repeatedly executing a Monte Carlo Tree Search to generate moves until the end
# of the game is reached.
def play_game(config: AlphaZeroConfig, network: Network):
  game = Game()
  while not game.terminal() and len(game.history) < config.max_moves:
    action, root = run_mcts(config, game, network)
    game.apply(action)
    game.store_search_statistics(root)
  return game


# Core Monte Carlo Tree Search algorithm.
# To decide on an action, we run N simulations, always starting at the root of
# the search tree and traversing the tree according to the UCB formula until we
# reach a leaf node.
def run_mcts(config: AlphaZeroConfig, game: Game, network: Network):
  root = Node(0)
  evaluate(root, game, network)
  add_exploration_noise(config, root)

  for _ in range(config.num_simulations):
    node = root
    scratch_game = game.clone()
    search_path = [node]

    while node.expanded():
      action, node = select_child(config, node)
      scratch_game.apply(action)
      search_path.append(node)

    value = evaluate(node, scratch_game, network)
    backpropagate(search_path, value, scratch_game.to_play())
  return select_action(config, game, root), root


def select_action(config: AlphaZeroConfig, game: Game, root: Node):
  visit_counts = [(child.visit_count, action)
                  for action, child in root.children.items()]
  if len(game.history) < config.num_sampling_moves:
    _, action = softmax_sample(visit_counts)
  else:
    _, action = max(visit_counts)
  return action


# Select the child with the highest UCB score.
def select_child(config: AlphaZeroConfig, node: Node):
  _, action, child = max((ucb_score(config, node, child), action, child)
                         for action, child in node.children.items())
  return action, child


# The score for a node is based on its value, plus an exploration bonus based on
# the prior.
def ucb_score(config: AlphaZeroConfig, parent: Node, child: Node):
  pb_c = math.log((parent.visit_count + config.pb_c_base + 1) /
                  config.pb_c_base) + config.pb_c_init
  pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)

  prior_score = pb_c * child.prior
  value_score = child.value()
  return prior_score + value_score
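

# (Comment added for clarity; not part of the published pseudocode.) Written
# out, with N the parent's visit count, N_a and P_a the child's visit count
# and prior, and Q_a the child's mean value, the score above is:
#
#   UCB(a) = Q_a + P_a * sqrt(N) / (1 + N_a)
#                  * (pb_c_init + log((N + pb_c_base + 1) / pb_c_base))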


# We use the neural network to obtain a value and policy prediction.
def evaluate(node: Node, game: Game, network: Network):
  value, policy_logits = network.inference(game.make_image(-1))

  # Expand the node.
  node.to_play = game.to_play()
  policy = {a: math.exp(policy_logits[a]) for a in game.legal_actions()}
  policy_sum = sum(policy.values())
  for action, p in policy.items():
    node.children[action] = Node(p / policy_sum)
  return value


# At the end of a simulation, we propagate the evaluation all the way up the
# tree to the root.
def backpropagate(search_path: List[Node], value: float, to_play):
  for node in search_path:
    node.value_sum += value if node.to_play == to_play else (1 - value)
    node.visit_count += 1
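

# (Comment added for clarity; not part of the published pseudocode.) Values in
# this pseudocode live in [0, 1] from the point of view of the player to move
# at the evaluated position, so `1 - value` flips the result to the opponent's
# perspective; this matches the uniform network's default value of 0.5 above.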


# At the start of each search, we add dirichlet noise to the prior of the root
# to encourage the search to explore new actions.
def add_exploration_noise(config: AlphaZeroConfig, node: Node):
  actions = list(node.children.keys())
  noise = numpy.random.dirichlet([config.root_dirichlet_alpha] * len(actions))
  frac = config.root_exploration_fraction
  for a, n in zip(actions, noise):
    node.children[a].prior = node.children[a].prior * (1 - frac) + n * frac
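

# (Comment added for clarity; not part of the published pseudocode.) The paper
# chose the Dirichlet alpha roughly in inverse proportion to the typical
# number of legal moves in each game, hence 0.3 for chess, 0.15 for shogi and
# 0.03 for Go in the config above.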


######### End Self-Play ##########
##################################

##################################
####### Part 2: Training #########


def train_network(config: AlphaZeroConfig, storage: SharedStorage,
                  replay_buffer: ReplayBuffer):
  network = Network()
  optimizer = tf.train.MomentumOptimizer(config.learning_rate_schedule,
                                         config.momentum)
  for i in range(config.training_steps):
    if i % config.checkpoint_interval == 0:
      storage.save_network(i, network)
    batch = replay_buffer.sample_batch()
    update_weights(optimizer, network, batch, config.weight_decay)
  storage.save_network(config.training_steps, network)


def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
  loss = 0
  for image, (target_value, target_policy) in batch:
    value, policy_logits = network.inference(image)
    loss += (
        tf.losses.mean_squared_error(value, target_value) +
        tf.nn.softmax_cross_entropy_with_logits(
            logits=policy_logits, labels=target_policy))

  for weights in network.get_weights():
    loss += weight_decay * tf.nn.l2_loss(weights)

  optimizer.minimize(loss)


######### End Training ###########
##################################

################################################################################
############################# End of pseudocode ################################
################################################################################


# Stubs to make the typechecker happy, should not be included in pseudocode
# for the paper.
def softmax_sample(d):
  return 0, 0


def launch_job(f, *args):
  f(*args)


def make_uniform_network():
  return Network()
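

For anyone tempted by the backgammon suggestion, here is a rough, hypothetical sketch of the game-specific subclass that would have to be filled in. Everything in it (the class name, the action-space size, the dice handling) is an assumption for illustration, not something from the paper. In particular, the published pseudocode assumes a deterministic game, so backgammon's dice would need extra machinery (chance nodes in the search, or dice rolls folded into the state) that AlphaZero as described does not provide.

class BackgammonGame(Game):
  """Hypothetical sketch only; not part of the published pseudocode."""

  def __init__(self, history=None):
    super(BackgammonGame, self).__init__(history)
    # Assumed encoding: one action index per (from-point, to-point) pair, with
    # the bar and bearing off treated as extra "points". The exact size
    # depends entirely on the move encoding chosen.
    self.num_actions = 26 * 26

  def clone(self):
    return BackgammonGame(list(self.history))

  def terminal(self):
    # All 15 checkers of one side borne off (or a dropped cube in match play).
    return False  # placeholder

  def legal_actions(self):
    # Depends on the current dice roll, which the deterministic pseudocode
    # above has no notion of; the roll would have to live in the game state.
    return []

  def make_image(self, state_index: int):
    # Feature planes: checker counts per point for both players, bar and
    # borne-off counters, the dice, and cube value/ownership.
    return []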


