-- Monte Carlo Simulations Are Like Unit Tests for Bandit Algorithms
Bandit algorithms are examples of both active learning, which refers to algorithms that actively select which data they should receive, and online learning, which refers to algorithms that analyze data in real time and provide results on the fly.
-- Simulating the Arms of a Bandit Problem
Optimizing click-through rates for ads: Every time we show someone an ad, we’ll imagine that there’s a fixed probability that they’ll click on the ad. The bandit algorithm will then estimate this probability and try to decide on a strategy for showing ads that maximizes the click-through rate.
Conversion rates for new users: Every time a new visitor comes to our site who isn’t already a registered user, we’ll imagine that there’s a fixed probability that they’ll register as a user after seeing the landing page. We’ll then estimate this probability and try to decide on a strategy for maximizing our conversion rate.
In both cases, each pull of an arm amounts to a draw from a Bernoulli distribution: the reward is 1 with some fixed probability p and 0 otherwise. The BernoulliArm class below simulates exactly that.
import random

class BernoulliArm():
    def __init__(self, p):
        # Probability that a pull of this arm pays out a reward of 1.0
        self.p = p

    def draw(self):
        # Return 1.0 with probability p and 0.0 otherwise
        if random.random() > self.p:
            return 0.0
        else:
            return 1.0
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
# Wrap each success probability in a BernoulliArm
arms = [BernoulliArm(mu) for mu in means]
Drawing from these arms a few times in an interactive session produces output like the following:
>>> arms[0].draw()
1.0
>>> arms[1].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[3].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[4].draw()
0.0
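As a quick sanity check that the simulated arms behave as intended, we can average a large number of draws and confirm that the empirical frequency of 1.0 rewards is close to the arm's probability p. This is a small illustrative sketch and not part of the book's code; the check_arm helper and the number of draws are assumptions.

def check_arm(arm, n_draws=10000):
    # Average many draws; by the law of large numbers this should be
    # close to the arm's underlying success probability p.
    return sum(arm.draw() for _ in range(n_draws)) / float(n_draws)

# An arm with p = 0.9 should print a value near 0.9.
print(check_arm(BernoulliArm(0.9)))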
def test_algorithm(algo, arms, num_sims, horizon):
    # Flat arrays with one entry per (simulation, time step) pair
    chosen_arms = [0.0 for i in range(num_sims * horizon)]
    rewards = [0.0 for i in range(num_sims * horizon)]
    cumulative_rewards = [0.0 for i in range(num_sims * horizon)]
    sim_nums = [0.0 for i in range(num_sims * horizon)]
    times = [0.0 for i in range(num_sims * horizon)]

    for sim in range(num_sims):
        sim = sim + 1
        # Reset the algorithm's state at the start of every simulation
        algo.initialize(len(arms))

        for t in range(horizon):
            t = t + 1
            index = (sim - 1) * horizon + t - 1
            sim_nums[index] = sim
            times[index] = t

            # Ask the algorithm which arm to pull, then simulate the pull
            chosen_arm = algo.select_arm()
            chosen_arms[index] = chosen_arm
            reward = arms[chosen_arms[index]].draw()
            rewards[index] = reward

            # Running total of reward within the current simulation
            if t == 1:
                cumulative_rewards[index] = reward
            else:
                cumulative_rewards[index] = cumulative_rewards[index - 1] + reward

            # Let the algorithm learn from the observed reward
            algo.update(chosen_arm, reward)

    return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]
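Note that test_algorithm only assumes that the algorithm object exposes initialize(n_arms), select_arm() and update(chosen_arm, reward). As a hypothetical illustration of that interface (not part of the book's code), here is a baseline that picks arms uniformly at random; it plugs into test_algorithm the same way EpsilonGreedy does below.

import random

class RandomSelection():
    # Minimal example of the interface test_algorithm expects:
    # initialize(n_arms), select_arm() and update(chosen_arm, reward).
    def initialize(self, n_arms):
        self.n_arms = n_arms

    def select_arm(self):
        return random.randrange(self.n_arms)

    def update(self, chosen_arm, reward):
        # A purely random baseline learns nothing from rewards.
        pass

# results = test_algorithm(RandomSelection(), arms, 1000, 250)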
# Pull in BernoulliArm, EpsilonGreedy, ind_max and test_algorithm
exec(open("core.py").read())

import random

random.seed(1)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = [BernoulliArm(mu) for mu in means]
print("Best arm is " + str(ind_max(means)))

f = open("algorithms/epsilon_greedy/standard_results.tsv", "w")

for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
    algo = EpsilonGreedy(epsilon, [], [])
    algo.initialize(n_arms)
    # 5000 independent simulations, each 250 time steps long
    results = test_algorithm(algo, arms, 5000, 250)
    # Each output row: epsilon, sim number, time step, chosen arm,
    # reward, cumulative reward
    for i in range(len(results[0])):
        f.write(str(epsilon) + "\t")
        f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")

f.close()
-- Analyzing Results from a Monte Carlo Study
Approach 1: Track the Probability of Choosing the Best Arm
Approach 2: Track the Average Reward at Each Point in Time
Approach 3: Track the Cumulative Reward at Each Point in Time
All three metrics can be computed directly from the TSV file produced above; a sketch follows below.
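Below is a minimal sketch of how these three metrics could be computed from the TSV file written by the driver script above. It is not the book's own analysis code; the analyze_results helper, the column order and the best_arm argument are assumptions based on the driver.

def analyze_results(path, best_arm):
    # Accumulators keyed by (epsilon, time step)
    counts = {}           # simulations contributing to each point
    best_choices = {}     # times the best arm was chosen (Approach 1)
    reward_sums = {}      # sum of per-step rewards (Approach 2)
    cum_reward_sums = {}  # sum of cumulative rewards (Approach 3)

    for line in open(path):
        epsilon, sim, t, chosen_arm, reward, cum_reward = line.strip().split("\t")
        key = (epsilon, int(float(t)))
        counts[key] = counts.get(key, 0) + 1
        best_choices[key] = best_choices.get(key, 0) + (int(float(chosen_arm)) == best_arm)
        reward_sums[key] = reward_sums.get(key, 0.0) + float(reward)
        cum_reward_sums[key] = cum_reward_sums.get(key, 0.0) + float(cum_reward)

    # For every (epsilon, t): probability of choosing the best arm,
    # average reward, and average cumulative reward.
    return {
        key: (best_choices[key] / float(counts[key]),
              reward_sums[key] / counts[key],
              cum_reward_sums[key] / counts[key])
        for key in counts
    }

# Example: pass whichever arm index the driver printed as the best arm.
metrics = analyze_results("algorithms/epsilon_greedy/standard_results.tsv", best_arm=2)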