Friday, March 11, 2016

Bandit algorithms 4 - Debugging Bandit Algorithms

-- Monte Carlo Simulations Are Like Unit Tests for Bandit Algorithms

Active learning refers to algorithms that actively select which data they should receive.
Online learning refers to algorithms that analyze data in real time and provide results on the fly.

-- Simulating the Arms of a Bandit Problem

Optimizing click-through rates for ads: Every time we show someone an ad, we’ll imagine that there’s a fixed probability that they’ll click on the ad. The bandit algorithm will then estimate this probability and try to decide on a strategy for showing ads that maximizes the click-through rate.

Conversion rates for new users: Every time a new visitor comes to our site who isn’t already a registered user, we’ll imagine that there’s a fixed probability that they’ll register as a user after seeing the landing page. We’ll then estimate this probability and try to decide on a strategy for maximizing our conversion rate.

import random

class BernoulliArm():
  def __init__(self, p):
    self.p = p  # probability that a single draw pays out a reward of 1.0

  def draw(self):
    # Simulate one pull of the arm: reward 1.0 with probability p, else 0.0
    if random.random() > self.p:
      return 0.0
    else:
      return 1.0

means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = [BernoulliArm(mu) for mu in means]


>>> arms[0].draw()
1.0
>>> arms[1].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[3].draw()
0.0
>>> arms[2].draw()
0.0
>>> arms[4].draw()
0.0
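
Each call to draw() returns only 0.0 or 1.0, so single draws say little about an arm. As a quick sanity check (a sketch of mine, not code from the book), you can average many draws and confirm that the empirical mean approaches the arm's true probability p:

# Sanity check (a sketch, not from the book): over many draws, the empirical
# mean of a BernoulliArm should be close to its true success probability p.
import random

random.seed(1)
arm = BernoulliArm(0.2)
draws = [arm.draw() for _ in range(100000)]
print(sum(draws) / len(draws))  # prints a value close to 0.2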


def test_algorithm(algo, arms, num_sims, horizon):
  # Run num_sims independent simulations of horizon draws each, recording for
  # every (simulation, time step) pair which arm was chosen, the reward
  # received, and the cumulative reward so far.
  chosen_arms = [0.0 for i in range(num_sims * horizon)]
  rewards = [0.0 for i in range(num_sims * horizon)]
  cumulative_rewards = [0.0 for i in range(num_sims * horizon)]
  sim_nums = [0.0 for i in range(num_sims * horizon)]
  times = [0.0 for i in range(num_sims * horizon)]

  for sim in range(num_sims):
    sim = sim + 1
    # Reset the algorithm's state at the start of each simulation
    algo.initialize(len(arms))

    for t in range(horizon):
      t = t + 1
      index = (sim - 1) * horizon + t - 1
      sim_nums[index] = sim
      times[index] = t

      # Ask the algorithm which arm to play, then draw a reward from that arm
      chosen_arm = algo.select_arm()
      chosen_arms[index] = chosen_arm

      reward = arms[chosen_arm].draw()
      rewards[index] = reward

      if t == 1:
        cumulative_rewards[index] = reward
      else:
        cumulative_rewards[index] = cumulative_rewards[index - 1] + reward

      # Let the algorithm update its estimates with the observed reward
      algo.update(chosen_arm, reward)

  return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]


execfile("core.py")

import random

random.seed(1)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = [BernoulliArm(mu) for mu in means]
print("Best arm is " + str(ind_max(means)))

f = open("algorithms/epsilon_greedy/standard_results.tsv", "w")

for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
  algo = EpsilonGreedy(epsilon, [], [])
  algo.initialize(n_arms)
  results = test_algorithm(algo, arms, 5000, 250)
  # Each row of the TSV: epsilon, sim_num, time, chosen_arm, reward, cumulative_reward
  for i in range(len(results[0])):
    f.write(str(epsilon) + "\t")
    f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")

f.close()

-- Analyzing Results from a Monte Carlo Study

Approach 1: Track the Probability of Choosing the Best Arm
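
The book does this analysis in R with ggplot2; here is a minimal Python sketch (my own, not the book's code) that works directly on the results list returned by test_algorithm for a single epsilon value (after the loop above, results holds the run for epsilon = 0.5), with ind_max(means) giving the index of the best arm:

# Sketch: for each time step t, estimate the probability that the algorithm
# chose the best arm, averaged over all simulations.
best_arm = ind_max(means)
sim_nums, times, chosen_arms, rewards, cumulative_rewards = results
num_sims = max(sim_nums)
horizon = max(times)

prob_best = [0.0 for t in range(horizon)]
for i in range(len(times)):
  if chosen_arms[i] == best_arm:
    prob_best[times[i] - 1] += 1.0 / num_sims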

Approach 2: Track the Average Reward at Each Point in Time
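
Average reward is computed the same way, averaging the observed rewards instead of an indicator for the best arm (continuing the sketch above):

# Sketch, continuing from Approach 1: average reward received at time step t,
# over all simulations.
avg_reward = [0.0 for t in range(horizon)]
for i in range(len(times)):
  avg_reward[times[i] - 1] += rewards[i] / num_sims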


Approach 3: Track the Cumulative Reward at Each Point in Time
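
And the cumulative reward, again averaged over simulations at each time step (continuing the same sketch):

# Sketch, continuing from Approach 1: average cumulative reward up to time t.
avg_cumulative_reward = [0.0 for t in range(horizon)]
for i in range(len(times)):
  avg_cumulative_reward[times[i] - 1] += cumulative_rewards[i] / num_sims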
