Code from Chapter 2 of “Machine Learning and Deep Learning: Simulation with Python” has been rewritten for clarity.
Environment
- Python 3.6.5 Anaconda
Problem Setup
This is the learning program for maze navigation in section 2.2.3.
As in the book, we store the Q-values in a single list (q_value): 1 value for the first level \((=2^0)\), 2 values for the second level \((=2^1)\), 4 values for the third level \((=2^2)\), and 8 values for the fourth level \((=2^3)\), totaling 15 values.
| Level 1 | 0 |   |   |    |    |    |    |    |
|---------|---|---|---|----|----|----|----|----|
| Level 2 | 1 | 2 |   |    |    |    |    |    |
| Level 3 | 3 | 4 | 5 | 6  |    |    |    |    |
| Level 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
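In this flat layout, the two children of the node at index \(i\) sit at indices \(2i+1\) and \(2i+2\), which is exactly how the code below moves one level down the maze. The following is a minimal sketch of that mapping; the helper functions children and level_of are our own illustrations, not part of the book’s code.

```python
import math


def children(i):
    """Child indices of node i in the level-by-level flattened tree."""
    return 2 * i + 1, 2 * i + 2


def level_of(i):
    """1-based level that index i belongs to."""
    return int(math.log2(i + 1)) + 1


print(children(0))   # (1, 2)   -> level 2
print(children(6))   # (13, 14) -> level 4
print(level_of(14))  # 4: the goal is a leaf in the last level
```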
Additionally, as in the book, we set index 14 as the goal. This time, we store the index of the goal in a constant variable called GOAL_POSITION. It’s also possible to experiment with different goal positions by changing this value.
Code
We’ve changed some variable names and conditional statements to make the code easier to follow. It would be possible to make the code more flexible, so that it handles other hierarchical structures, but we haven’t done that, to avoid adding complexity.
We’ve introduced a constant variable THRESHOLD. If the Q-values don’t change for this many consecutive iterations, we consider the learning to have converged, and we output the iteration at which the values first stopped changing. How many iterations convergence takes depends on the values of ALPHA, GAMMA, and EPSILON.
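For reference, update_q in the listing below applies the usual tabular update. For a node \(s\) that is not in the last level, \( Q(s) \leftarrow Q(s) + \alpha\,(\gamma \max_{s'} Q(s') - Q(s)) \), where \(s'\) ranges over the two children of \(s\); at the goal, \( Q(s) \leftarrow Q(s) + \alpha\,(1000 - Q(s)) \). The increment is truncated to an integer because the Q-values are stored as integers. Here \(\alpha\) is ALPHA, \(\gamma\) is GAMMA, and EPSILON is the probability that select_a picks a child at random rather than greedily, which is why all three affect how quickly the values stop changing.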
```python
import random

ALPHA = 0.1        # learning rate
GAMMA = 0.9        # discount factor
EPSILON = 0.8      # probability of choosing a random action
LEVEL = 3          # depth of the tree below the root (leaves are level 4)
GOAL_POSITION = 14
THRESHOLD = 100    # iterations without change before we declare convergence


def select_a(previous_position, q_value):
    """ Select the next position (action) from the two children. """
    if random.random() < EPSILON:
        # explore: pick one of the two children at random
        if random.randint(0, 1) == 0:
            return 2 * previous_position + 1
    else:
        # exploit: pick the child with the larger Q value
        if q_value[2 * previous_position + 1] > q_value[2 * previous_position + 2]:
            return 2 * previous_position + 1
    return 2 * previous_position + 2


def update_q(position, q_value):
    """ Calculate the updated Q value for the given position. """
    if position < 2 ** LEVEL - 1:  # if not in the last level
        qmax = max(
            q_value[2 * position + 1],
            q_value[2 * position + 2])
        return q_value[position] + int(
            ALPHA * (GAMMA * qmax - q_value[position]))
    # if in the last level
    if position == GOAL_POSITION:
        return q_value[position] + int(
            ALPHA * (1000 - q_value[position]))
    # elif position == 11:
    #     return q_value[position] + int(
    #         ALPHA * (500 - q_value[position]))
    return q_value[position]


def solve(seed, time):
    random.seed(seed)
    # one Q value per node: 2 ** (LEVEL + 1) - 1 = 15 entries
    q_value = [
        random.randint(0, 100)
        for i in range(2 ** (LEVEL + 1) - 1)
    ]
    convergence_time = 0
    eq_count = 0
    print(q_value)
    for i in range(time):
        previous_q_value = q_value[:]  # copy, so in-place updates can be detected
        position = 0  # initial state (root)
        for j in range(3):  # descend LEVEL steps to a leaf
            position = select_a(position, q_value)
            q_value[position] = update_q(position, q_value)
        print(q_value)
        if q_value == previous_q_value:
            if convergence_time == 0:
                convergence_time = i
            eq_count += 1
            if eq_count > THRESHOLD:
                print(convergence_time)
                break
        else:
            eq_count = 0
            convergence_time = 0


if __name__ == "__main__":
    solve(32767, 1000)
```
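The experiments mentioned above (a different goal position, or different seeds) can be tried by editing the constants at the top of the listing, or with a small driver such as the sketch below. The file name maze_q.py is our assumption; save the listing under that name for the import to work.

```python
# Sketch only: assumes the listing above is saved as maze_q.py.
import maze_q

# See how the convergence time depends on the initial random Q-values.
for seed in (1, 7, 32767):
    maze_q.solve(seed, 1000)

# Move the goal (and therefore the 1000 reward) to leaf 11, then run again
# with the original seed.
maze_q.GOAL_POSITION = 11
maze_q.solve(32767, 1000)
```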