This feels like a concrete implementation of dynamic programming applied to decision making. It also works backward from the end: take the optimal value of the later state, add the transition reward, and then choose the optimal state. What has to be specified by hand are the individual values and the rewards for switching between states (set according to the actual problem).
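As a quick summary of what the code below computes, here is the backward recursion in my own notation (a sketch assumed to correspond to the referenced equations (12) and (16), not copied from them), where u_t(i) is the maximum cumulative reward with stock i at time t, a the ordered amount, R the one-step reward matrix and P the transition matrix:

$$
u_T(i) = g(i), \qquad
u_t(i) = \max_{0 \le a \le s - i}\Big[\, R(i, a) + \sum_{j=0}^{s} P(i + a,\ j)\, u_{t+1}(j) \Big], \quad t = T-1, \dots, 0.
$$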

import numpy


def markov_decision_process(o, g, h, s, T, f, p, test=False):
    def cal_transition_function():
        """
        Return P where P[before_sold][after_sold] is the probability that, given
        before_sold products at the beginning of the month, there are after_sold
        products left at the end of the month. See table 1.
        """
        if test:
            return numpy.array(
                [
                    [1, 0, 0, 0],
                    [3 / 4, 1 / 4, 0, 0],
                    [1 / 4, 1 / 2, 1 / 4, 0],
                    [0, 1 / 4, 1 / 2, 1 / 4]
                ]
            )
        P = numpy.zeros((s + 1, s + 1))
        # First dimension is the product amount before selling
        for before_sold in range(s + 1):
            # Second dimension is the product amount after selling.
            # For convenience we loop over the demand instead and add each
            # probability to the corresponding element of P.
            for demand in range(s + 1):
                if demand >= before_sold:
                    after_sold = 0
                else:
                    after_sold = before_sold - demand
                P[before_sold][after_sold] += p[demand]
        return P

    P = cal_transition_function()
    print(f'Transition probability P[before_sold][after_sold] =\n{P}\n')

    def cal_reward_matrix():
        """
        Return R where R[current_stock][action] is the reward obtained if the
        decision maker takes action `action` when the current stock is
        `current_stock`. See table 2.
        """
        if test:
            return numpy.array(
                [
                    [0, -1, -2, -5],
                    [5, 0, -3, 0],
                    [6, -1, 0, 0],
                    [5, 0, 0, 0]
                ]
            )

        def F(stock):
            """
            Return the expected sales revenue F according to f and p. See equation (14).
            """
            reward = 0
            for demand in range(s + 1):
                if demand <= stock:
                    reward += p[demand] * f(demand)
                else:
                    # All stock is sold out
                    reward += p[demand] * f(stock)
            return reward

        R = numpy.zeros((s + 1, s + 1))
        # First dimension is the product amount before taking the action
        for before_action in range(s + 1):
            # Second dimension is the action's inbound product amount
            for inbound in range(s + 1):
                # No need to calculate ❌ in table 2
                if before_action + inbound > s:
                    continue
                R[before_action][inbound] = F(before_action + inbound) - o(inbound) - h(before_action + inbound)
        return R

    R = cal_reward_matrix()
    print(f'Reward matrix R[current_stock][action] =\n{R}\n')

    def cal_cumulative_reward():
        """
        Return a tuple (u, a). See equation (12),
        where u[time][current_stock] is the cumulative maximum reward when there is `current_stock` stock at time `time`,
        and a[time][current_stock] is the best action to take when the stock amount is `current_stock` at time `time`.
        """
        u = numpy.zeros((T + 1, s + 1))
        a = numpy.zeros((T + 1, s + 1))
        # First dimension is time, in range [0, 1, ..., T]
        # We should initialize u[T] first
        for stock in range(s + 1):
            u[T][stock] = g(stock)
        # Then we work backwards to get u[T-1], u[T-2], ..., u[0]
        for t in range(T - 1, -1, -1):
            print('Calculating time:', t)
            # Second dimension is the current stock
            for current_stock in range(s + 1):
                print('\t if current stock is:', current_stock)
                best_action = 0
                maximum_reward = float('-inf')
                # For every action we calculate a cumulative reward and then take the maximum, see equation (16)
                for inbound in range(0, s + 1 - current_stock):
                    # Immediate reward for taking this action at current_stock
                    reward = R[current_stock][inbound]
                    # Expected reward after current_stock
                    for next_stock in range(s + 1):
                        reward += P[current_stock + inbound][next_stock] * u[t + 1][next_stock]
                    print('\t\t if take action:', inbound, 'the cumulative reward will be:', reward)
                    if reward > maximum_reward:
                        maximum_reward = reward
                        best_action = inbound
                u[t][current_stock] = maximum_reward
                a[t][current_stock] = best_action
                print('\t so we choose action:', best_action, 'and will get max reward:', maximum_reward)
            print('\n')
        return (u, a)

    (u, a) = cal_cumulative_reward()
    print(f'Cumulative maximum reward u[time][current_stock] =\n{u}\n')
    print(f'Best action a[time][current_stock] = \n{a}\n')


def run_example(T=3, test=False):
    # Model parameters, see equation (13)
    def o(u):
        return 2 * u

    def g(u):
        return 0

    def h(u):
        return u

    def f(u):
        return 8 * u

    s = 3
    # p_0, p_1, ..., p_s-1, q_s
    # p_0, p_1, p_2, p_3 for this example
    p = [1 / 4, 1 / 2, 1 / 4, 0]
    markov_decision_process(o, g, h, s, T, f, p, test=test)


T = 10
run_example(T=T, test=False)