%% LyX 1.1 created this file.  For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[english]{amsart}
\usepackage[T1]{fontenc}
\usepackage[latin1]{inputenc}
\usepackage{babel}
\usepackage{graphics}
\usepackage{amssymb}

\makeatletter

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
\providecommand{\LyX}{L\kern-.1667em\lower.25em\hbox{Y}\kern-.125emX\@}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
 \theoremstyle{plain}    
 \newtheorem{thm}{Theorem}[section]
 \numberwithin{equation}{section} %% Comment out for sequentially-numbered
 \numberwithin{figure}{section} %% Comment out for sequentially-numbered
 \theoremstyle{plain}    
 \newtheorem{cor}[thm]{Corollary} %%Delete [thm] to re-start numbering
 \theoremstyle{plain}    
 \newtheorem{lem}[thm]{Lemma} %%Delete [thm] to re-start numbering
 \theoremstyle{definition}
 \newtheorem{defn}[thm]{Definition}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
\usepackage{icml2002}
\usepackage{epsf}

\renewcommand{\baselinestretch}{0.95}

\makeatother
\begin{document}
\twocolumn[
\icmltitle{Approximately Optimal Approximate Reinforcement Learning}

\icmlauthor{Sham Kakade}{sham@gatsby.ucl.ac.uk}
\icmladdress{Gatsby Computational Neuroscience Unit, UCL,
             London WC1N 3AR, UK}
\icmlauthor{John Langford}{jcl@cs.cmu.edu}
\icmladdress{Computer Science Department, Carnegie-Mellon University, 5000 Forbes Avenue, Pittsburgh, PA 15217}
\vskip 0.3in
]


\begin{abstract}
In order to solve realistic reinforcement learning problems, it is
critical that approximate algorithms be used. In this paper, we present
the \emph{conservative policy iteration} algorithm which finds an
{}``approximately{}'' optimal policy, given access to a \emph{restart
distribution} (which draws the next state from a particular distribution)
and an approximate \emph{greedy policy chooser}. Crudely, the greedy
policy chooser outputs a policy that usually chooses actions with
the largest state-action values of the current policy, \emph{ie} it
outputs an {}``approximate{}'' greedy policy. This greedy policy
chooser can be implemented using standard value function approximation
techniques. Under these assumptions, our algorithm: (1) is guaranteed
to improve a performance metric (2) is guaranteed to terminate in
a {}``small{}'' number of timesteps and (3) returns an {}``approximately{}''
optimal policy. The quantified statements of (2) and (3) depend on
the quality of the greedy policy chooser, but \emph{not} explicitly
on the the size of the state space. 
\end{abstract}


\section{Introduction}

The two standard approaches, greedy dynamic programming and policy
gradient methods, have enjoyed many empirical successes on reinforcement
learning problems. Unfortunately, both methods can fail to efficiently
improve the policy. Approximate value function methods suffer from
a lack of strong theoretical performance guarantees. We show how policy
gradient techniques can require an unreasonably large number of samples
in order to determine the gradient accurately. This is due to policy
gradient method's fundamental intertwining of {}``exploration{}''
and {}``exploitation{}''.

In this paper, we consider a setting in which our algorithm is given
access to a \emph{restart distribution} and an {}``approximate{}''
\emph{greedy policy chooser.} Informally, the \emph{restart distribution}
allows the agent to obtain its next state from a fixed distribution
of our choosing. Through a more uniform restart distribution, the
agent can gain information about states that it wouldn't necessarily
visit otherwise. Also informally, the greedy policy chooser is a {}``black
box{}'' that outputs a policy that \emph{on average} chooses actions
with large advantages with respect to the current policy, \emph{ie}
it provides an {}``approximate{}'' greedy policy. This {}``black
box{}'' algorithm can be implemented using one of the well-studied
\emph{regression algorithms} for value functions (see \cite{sutton-book,neuro-programming}).
The quality of the resulting greedy policy chooser is then related
to the quality of this {}``black box{}''.

Drawing upon the strengths of the standard approaches, we propose
the \emph{conservative policy iteration} algorithm. The key ingredients
of this algorithm are: (1) the policy is improved in a more uniform
manner over the state-space and (2) a more conservative policy update
is performed in which the new policy is a mixture distribution of
the current policy and the greedy policy. Crudely, the importance
of (1) is to incorporate {}``exploration{}'' and the importance
of (2) is to avoid the pitfalls of greedy dynamic programming methods
which can suffer from significant policy degradation by directly using
{}``approximate{}'' greedy policies. Our contribution is in proving
that such an algorithm converges in a {}``small{}'' number of steps
and returns an {}``approximately{}'' optimal policy, where the quantified
claims do \emph{not} explicitly depend on the the size of the state
space. We first review the problems with standard approaches, then
state our algorithm.


\section{Preliminaries}

A finite \textbf{Markov decision process} (MDP) is defined by the
tuple \( (S,D,A,\mathcal{R},\left\{ P(s^{\prime };s,a)\right\} ) \)
where: \( S \) is a finite set of states, \( D \) is the starting
state distribution, \( A \) is a finite set of actions, \( \mathcal{R} \)
is a reward \textbf{}function \( \mathcal{R}:S\times A\rightarrow [0,R] \),
and \( \left\{ P(s^{\prime };s,a)\right\}  \) are the transition
\textbf{}probabilities, with \( P(s^{\prime };s,a) \) giving the
next-state distribution upon taking action \( a \) in state \( s \). 

Although ultimately we desire an algorithm which uses only the given
MDP \( M \), we assume access to a restart distribution, defined
as follows: 

\begin{defn}
\label{simulator}A \textbf{\( \mu  \) restart distribution} draws
the next state from the distribution \( \mu  \).
\end{defn}
This restart distribution is a slightly weaker version of the \emph{generative
model} in \cite{kearns99sparse}. As in \cite{kearns99sparse}, our
assumption is considerably weaker than having knowledge of the full
transition model. However, it is a much stronger assumption than having
only {}``irreversible{}'' experience, in which the agent must follow
a single trajectory, with no ability to reset to obtain another trajectory
from a state. If \( \mu  \) is chosen to be a relatively uniform
distribution (not necessarily \( D \)), then this \( \mu  \) restart
distribution can obviate the need for explicit exploration.

The agent's decision making procedure is characterized by a stochastic
policy \( \pi (a;s) \), which is the probability of taking action
\( a \) in state \( s \) (where the semi-colon is used to distinguish
the parameters from the random variables of the distribution). We
only consider the case where the goal of the agent is to maximize
the \( \gamma  \)-discounted average reward from the starting state
distribution \( D \), though this has a similar solution to maximizing
the average reward for processes that {}``mix{}'' on a reasonable
timescale \cite{Sham_colt}. Given \( 0\leq \gamma <1 \), we define
the \textbf{value function} for a given policy \( \pi  \) as \[
V_{\pi }(s)\equiv (1-\gamma )E\left[ \sum _{t=0}^{\infty }\gamma ^{t}\mathcal{R}(s_{t},a_{t})|\pi ,s\right] \]
where \( s_{t} \) and \( a_{t} \) are random variables for the state
and action at time \( t \) upon executing the policy \( \pi  \)
from the starting state \( s \) (see \cite{puterman-book} for a
formal definition of this expectation). Note that we are using \emph{normalized}
values so \( V_{\pi }(s)\in [0,R] \). For a given policy \( \pi  \),
we define the \textbf{state-action value} as \[
Q_{\pi }(s,a)\equiv (1-\gamma )\mathcal{R}(s,a)+\gamma E_{s^{\prime }\sim P(s^{\prime };s,a)}\left[ V_{\pi }(s^{\prime })\right] \]
and as in \cite{sutton99policy} (much as in \cite{baird93advantage}),
we define the \textbf{advantage} as\[
A_{\pi }(s,a)\equiv Q_{\pi }(s,a)-V_{\pi }(s)\]
Again, both \( Q_{\pi }(s,a)\in [0,R] \) and \( A_{\pi }(s,a)\in [-R,R] \)
due to normalization. 

It is convenient to define the \textbf{\( \gamma  \)-discounted future
state distribution} (as in \cite{sutton99policy}) for a starting
state distribution \( \mu  \) as 

\begin{equation}
\label{state_dist.}
d_{\pi ,\mu }(s)\equiv (1-\gamma )\sum _{t=0}^{\infty }\gamma ^{t}\Pr (s_{t}=s;\pi ,\mu )
\end{equation}
where the \( 1-\gamma  \) is necessary for normalization. We abuse
notation and write \( d_{\pi ,s} \) for the discounted future-state
distribution with respect to the distribution which deterministically
starts from state \( s \). Note that \( V_{\pi }(s)=E_{(a^{\prime },s^{\prime })\sim \pi d_{\pi ,s}}\left[ \mathcal{R}(s^{\prime },a^{\prime })\right]  \).
This distribution is analogous to the stationary distribution in the
undiscounted setting, since as \( \gamma \rightarrow 1 \), \( d_{\pi ,s} \)
tends to the stationary distribution for all \( s \), if one such
exists. 

The goal of the agent is to maximize the discounted reward from the
start state distribution \( D \),\[
\eta _{D}(\pi )\equiv E_{s\sim D}\left[ V_{\pi }(s)\right] \, .\]
Note that \( \eta _{D}(\pi )=E_{(a,s)\sim \pi d_{\pi ,D}}\left[ \mathcal{R}(s,a)\right]  \).
A well known result is that a policy exists which simultaneously maximizes
\( V_{\pi }(s) \) for all states. 


\section{The Problems with Current Methods}

We now examine in more detail the problems with approximate value
function methods and policy gradient methods. There are three questions
to which we desire answers to:

\begin{enumerate}
\item Is there some performance measure that is guaranteed to improve at
every step?
\item How difficult is it to verify if a particular update improves this
measure?
\item After a reasonable number of policy updates, what performance level
is obtained?
\end{enumerate}
We now argue that both greedy dynamic programming and policy gradient
methods give unsatisfactory answers to these questions. Note that
we did not ask is {}``What is the quality of the asymptotic policy?{}''.
We are only interested in policies that we can find in a reasonable
amount of time. Understanding the problems with these current methods
gives insight into our new algorithm, which addresses these three
questions. 


\subsection{Approximate Value Function Methods}

Exact value function methods, such as policy iteration, typically
work in an iterative manner. Given a policy \( \pi  \), policy iteration
calculates the state-action value \( Q_{\pi }(s,a) \), and then creates
a new deterministic policy \( \pi ^{\prime }(a;s) \) such that \( \pi ^{\prime }(a;s)=1 \)
iff \( a\in \textrm{argmax}_{a}Q_{\pi }(s,a) \). This process is
repeated until the state-action values converge to their optimal values.
These exact value function methods have strong bounds showing how
fast the values converge to optimal (see \cite{puterman-book}).

Approximate value function methods typically use approximate estimates
of the state-action values in an exact method. These methods suffer
from a paucity of theoretical results on the performance of a policy
based on the approximate values. This leads to weak answers to all
three questions.

Let us consider some function approximator \( \widetilde{V}(s) \)
with the \( l_{\infty } \)-error:\[
\varepsilon =\max _{s}|\widetilde{V}(s)-V_{\pi }(s)|\]
where \( \pi  \) is some policy. Let \( \pi ^{\prime } \) be a greedy
policy based on this approximation. We have the following guarantee
(see \cite{neuro-programming}) for all states \( s \):\begin{equation}
\label{greedy_pi}
V_{\pi ^{\prime }}(s)\geq V_{\pi }(s)-\frac{2\gamma \varepsilon }{1-\gamma }.
\end{equation}
In other words, the performance is guaranteed to not decrease \emph{}by
more than \( \frac{2\gamma \varepsilon }{1-\gamma } \). Question
2 is not applicable since these methods don't guarantee improvement
and a performance measure to check isn't well defined. 

For approximate methods, the time required to obtain some performance
level is not well understood. Some convergence and asymptotic results
exist (see \cite{neuro-programming}). 


\subsection{Policy Gradients Methods}

Direct policy gradient methods attempt to find a good policy among
some restricted class of policies, by following the gradient of the
future reward. Given some parameterized class \( \{\pi _{\theta }|\theta \in \mathcal{R}^{m}\} \),
these methods compute the gradient

\begin{equation}
\label{policy_grad}
\nabla \eta _{D}=\sum _{s,a}d_{\pi ,D}(s)\nabla \pi (a;s)Q_{\pi }(s,a)
\end{equation}
 (as shown in \cite{sutton99policy}). 

For policy gradient techniques, question 1 has the appealing answer
that the performance measure of interest is guaranteed to improve
under gradient ascent. We now address question 2 by examining the
situations in which estimating the gradient direction is difficult.
We show that the lack of exploration in gradient methods translates
into requiring a large number of samples in order to accurately estimate
the gradient \emph{direction}.

Consider the simple MDP shown in Figure \ref{fig1} (adapted from
\cite{thrun92efficient}). Under a policy that gives equal probability
to all actions, the expected time to the goal from the left most state
is \( 3(2^{n}-n-1) \), and with \( n=50 \), the expected time to
the goal is about \( 10^{15} \). This MDP falls in the class of MDPs
in which random actions are more likely than not to increase the distance
to the goal state. For these classes of problems (see \cite{whitehead91complexity}),
the expected time to reach the goal state using undirected exploration,
\emph{ie} random walk exploration, is exponential in the size of the
state space. Thus, any \char`\"{}on-policy\char`\"{} method has to
run for at least this long before any policy improvement can occur.
In online value function methods, this problem is seen as a lack of
exploration. 


\begin{figure}
\resizebox*{0.45\textwidth}{!}{\includegraphics{figs/thrun_game.eps}} 


\caption{\label{fig1}Example 1: Two actions move agent to the left and one
actions moves agent to the right.}
\end{figure}


Any sensible estimate of the gradient without reaching the goal state
would be zero, and obtaining non-zero estimates requires exponential
time with \char`\"{}on-policy\char`\"{} samples. Importance sampling
methods do exist (see \cite{meuleau01exploration}), but are not feasible
solutions for this class of problems. The reason is that if the agent
could follow some {}``off-policy{}'' trajectory to reach the goal
state in a reasonable amount of time, the importance weights would
have to be exponentially large. 

Note that a zero estimate is a rather accurate estimate of the gradient
in terms of magnitude, but this provides no information about direction,
which is the crucial quantity of interest. The analysis in \cite{bartlett00estimation}
suggests a relatively small sample size is needed to accurately estimate
the magnitude (within some \( \varepsilon  \) tolerance), though
this does not imply an accurate direction if the gradient is small.
Unfortunately, the magnitude of the gradient can be very small when
the policy is far from optimal.


\begin{figure}
{\centering \resizebox*{0.07\textwidth}{!}{\includegraphics{figs/twomdp.eps}} \resizebox*{0.3\textwidth}{!}{\includegraphics{figs/fig_score.eps}} \par}


\caption{\label{fig-mdp} Example 2: A two state MDP and the average reward
vs. time (on a \protect\( 10^{7}\protect \) scale) of a policy under
standard gradient descent in the limit of an infinitesimally small
learning rate (initial conditions stated in text). }
\end{figure}

\begin{figure}
\resizebox*{0.3\textwidth}{!}{\includegraphics{figs/fig_stat.eps}} 


\caption{\label{fig2} The stationary probability (for figure \ref{fig-mdp})
of state \protect\( j\protect \) vs. time (on a \protect\( 10^{7}\protect \)
scale). }
\end{figure}


Let us give an additional example demonstrating the problems for the
simple two state MDP shown in Figure \ref{fig-mdp}, which uses the
common Gibbs table-lookup distributions, \( \{\pi _{\theta }:\pi (a;s)\propto \exp (\theta _{sa})\} \).
Increasing the chance of a self-loop at \( i \) decreases the stationary
probability of \( j \), which hinders the learning at state \( j \).
Under an initial policy that has the stationary distribution \( \rho (i)=.8 \)
and \( \rho (j)=.2 \) (using \( \pi (a_{1};i)=.8 \) and \( \pi (a_{1};j)=.9 \)),
learning at state \( i \) reduces the learning at state \( j \)
leading to an an extremely flat plateau of improvement at \( 1 \)
average reward shown in Figure \ref{fig-mdp}. Figure \ref{fig2}
shows that this problem is so severe that \( \rho (j) \) drops as
low as \( 10^{-7} \) from it's initial probability of \( .2 \).
As in example 1, to obtain a nonzero estimate of the gradient it is
necessary to visit state \( j \). The situation could be even worse
with a few extra states in a chain as in figure \ref{fig1}. 

Although asymptotically a good policy might be found, these results
do not bode well for the answer to question 3, which is concerned
with how fast such a policy can be found. These results suggest that
in any reasonable number of steps, a gradient method could end up
being trapped at plateaus where estimating the gradient direction
has an unreasonably large sample complexity. Answering question 3
is crucial to understand how well gradient methods perform, and (to
our knowledge) no such analysis exists.


\section{Approximately Optimal RL}

The fundamental problem with policy gradients is that \( \eta _{D} \),
which is what we ultimately seek to optimize, is insensitive to policy
improvement at unlikely states though policy improvement at these
unlikely states might be necessary in order for the agent to achieve
near optimal payoff. We desire an alternative performance measure
that does not down weight advantages at unlikely states or unlikely
actions. A natural candidate for a performance measure is to weight
the improvement from all states more uniformly (rather than by \( D \)),
such as \[
\eta _{\mu }(\pi )\equiv E_{s\sim \mu }\left[ V_{\pi }(s)\right] \]
 where \( \mu  \) is some {}``exploratory{}'' restart distribution.
Under our assumption of having access to a \( \mu  \)-restart distribution,
we can obtain estimates of \( \eta _{\mu }(\pi ) \).

Any optimal policy simultaneously maximizes both \( \eta _{\mu } \)
and \( \eta _{D} \). Unfortunately, the policy that maximizes \( \eta _{\mu } \)
within some restricted class of policies may have poor performance
according to \( \eta _{D} \), so we must ensure that maximizing \( \eta _{\mu } \)
results in a good policy under \( \eta _{D} \). 

Greedy policy iteration updates the policy to some \( \pi ^{\prime } \)
based on some approximate state-action values. Instead, let us consider
the following more conservative update rule: \begin{equation}
\label{pol_update}
\pi _{\textrm{new}}(a;s)=(1-\alpha )\pi (a;s)+\alpha \pi ^{\prime }(a;s)\, ,
\end{equation}
 for some \( \pi ^{\prime } \) and \( \alpha \in [0,1] \). To guarantee
improvement with \( \alpha =1 \), \( \pi ^{\prime } \) must choose
a better action at every state, or else we could suffer the penalty
shown in equation \ref{greedy_pi}.

In the remainder of this section, we describe a more conservative
policy iteration scheme using \( \alpha <1 \) and state the main
theorems of this paper. In subsection 4.1, we show that \( \eta _{\mu } \)
can improve under the much less stringent condition in which \( \pi ^{\prime } \)
often, but not always, chooses greedy actions. In subsection 4.2,
we assume access to a greedy policy chooser that outputs {}``approximately{}''
greedy policies \( \pi ^{\prime } \) and then bound the performance
of the policy found by our algorithm in terms of the quality of this
greedy policy chooser. 


\subsection{Policy Improvement}

A more reasonable situation is one in which we are able to improve
the policy with some \( \alpha >0 \) using a \( \pi ^{\prime } \)
that chooses better actions at most but not all states. Let us define
the \textbf{policy advantage} \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime }) \)
of some policy \( \pi ^{\prime } \) with respect to a policy \( \pi  \)
\textbf{}and a distribution \( \mu  \) to be\[
\mathbb {A}_{\pi ,\mu }(\pi ^{\prime })\equiv E_{s\sim d_{\pi ,\mu }}\left[ E_{a\sim \pi ^{\prime }(a;s)}\left[ A_{\pi }(s,a)\right] \right] \, .\]
 The policy advantage measures the degree to which \( \pi ^{\prime } \)
is choosing actions with large advantages, with respect to the set
of states visited under \( \pi  \) starting from a state \( s\sim \mu  \).
Note that a policy found by one step of policy improvement maximizes
the policy advantage. 

It is straightforward to show that \( \frac{\partial \eta _{\mu }}{\partial \alpha }|_{\alpha =0}=\frac{1}{1-\gamma }\mathbb {A}_{\pi ,\mu } \)
(using equation \ref{policy_grad}), so the change in \( \eta _{\mu } \)
is: \begin{equation}
\label{local_improve}
\Delta \eta _{\mu }=\frac{\alpha }{1-\gamma }\mathbb {A}_{\pi ,\mu }(\pi ^{\prime })+O(\alpha ^{2})\, .
\end{equation}
 Hence, for sufficiently small \( \alpha  \), policy improvement
occurs if the policy advantage is \emph{positive}, and at the other
extreme of \( \alpha =1 \), significant degradation could occur.
We now connect these two regimes to determine how much policy improvement
is possible.

\begin{thm}
\label{th_bound} Let \( \mathbb {A} \) be the policy advantage of
\( \pi ^{\prime } \) with respect to \( \pi  \) and \( \mu  \).
and let \( \varepsilon =\max _{s}|E_{a\sim \pi ^{\prime }(a;s)}\left[ A_{\pi }(s,a)\right] | \).
For the update rule \ref{pol_update} and for all \( \alpha \in [0,1] \):\[
\eta _{\mu }(\pi _{\textrm{new}})-\eta _{\mu }(\pi )\geq \frac{\alpha }{1-\gamma }(\mathbb {A}-\frac{2\alpha \gamma \varepsilon }{1-\gamma (1-\alpha )})\, .\]
 
\end{thm}
The proof of this theorem is given in the appendix. It is possible
to construct a two state example showing this bound is tight for all
\( \alpha  \) though we do not provide this example here.

The first term is analogous to the first order increase specified
in equation \ref{local_improve}, and the second term is a penalty
term. Note that if \( \alpha =1 \), the bound reduces to \[
\eta _{\mu }(\pi _{\textrm{new}})-\eta _{\mu }(\pi )\geq \frac{\mathbb {A}}{1-\gamma }-\frac{2\gamma \varepsilon }{1-\gamma }\]
 and the penalty term has the same form as that of greedy dynamic
programming, where \( \varepsilon  \), as defined here, is analogous
to the \( l_{\infty } \) error in equation \ref{greedy_pi}.

The following corollary shows that the greater the policy advantage
the greater the guaranteed performance increase. 

\begin{cor}
\label{th_how_far} Let \( R \) be the maximal possible reward and
\( \mathbb {A} \) be the policy advantage of \( \pi ^{\prime } \)
with respect to \( \pi  \) and \( \mu  \). If \( \mathbb {A}\geq 0 \),
then using \( \alpha =\frac{(1-\gamma )\mathbb {A}}{4R} \) guarantees
the following policy improvement:\[
\eta _{\mu }(\pi _{\textrm{new}})-\eta _{\mu }(\pi )\geq \frac{\mathbb {A}^{2}}{8R}\, .\]

\end{cor}
\begin{proof}
Using the previous theorem, it is straightforward to show the change
is bounded by \( \frac{\alpha }{1-\gamma }(\mathbb {A}-\alpha \frac{2R}{1-\gamma }) \).
The corollary follows by choosing the \( \alpha  \) that maximizes
this bound.
\end{proof}

\subsection{Answering question 3}

We address question 3 by first addressing how fast we converge to
some policy then bounding the quality of this policy. Naively, we
expect our ability to obtain policies with large advantages to affect
the speed of improvement and the quality of the final policy. Instead
of explicitly suggesting algorithms that find policies with large
policy advantages, we assume access to an \( \varepsilon  \)-greedy
policy chooser that solves this problem. Let us call this \( \varepsilon  \)-good
algorithm \( G_{\varepsilon }(\pi ,\mu ) \), which is defined as:

\begin{defn}
An \textbf{\( \varepsilon  \)-greedy policy chooser}, \( G_{\varepsilon }(\pi ,\mu ) \),
is a function of a policy \( \pi  \) and a state distribution \( \mu  \)
which returns a policy \( \pi ^{\prime } \) such that \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime })\geq \textrm{OPT}(\mathbb {A}_{\pi ,\mu })-\varepsilon  \),
where \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })\equiv \max _{\pi ^{\prime }}\mathbb {A}_{\pi ,\mu }(\pi ^{\prime }) \).
\end{defn}
In the discussion, we show that a regression algorithm that fits the
advantages with an \( \frac{\varepsilon }{2} \) average error is
sufficient to construct such a \( G_{\varepsilon } \). 

The {}``breaking point{}'' at which policy improvement is no longer
guaranteed occurs when the greedy policy chooser is no longer guaranteed
to return a policy with a positive policy advantage, \emph{ie} when
\( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })<\varepsilon  \). A crude
outline of the \textbf{Conservative Policy Iteration} algorithm is:

\begin{enumerate}
\item Call \( G_{\varepsilon }(\pi ,\mu ) \) to obtain some \( \pi ^{\prime } \)
\item Estimate the policy advantage \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime }) \)
\item If this policy advantage is small (less than \( \varepsilon  \)),
STOP.
\item Else, update the policy and go to (1).
\end{enumerate}
The full algorithm is specified in the next section. The following
theorem shows that in polynomial time, the full algorithm finds a
policy \( \pi  \) that is close to the {}``break point{}'' of the
greedy policy chooser. 

\begin{thm}
\label{CPI} With probability at least \( 1-\delta  \), conservative
policy iteration: i) improves \( \eta _{\mu } \) with every policy
update, ii) ceases in at most \( 72\frac{R^{2}}{\varepsilon ^{2}} \)
calls to \( G_{\varepsilon }(\pi ,\mu ) \), and iii) returns a policy
\( \pi  \) such that \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })<2\varepsilon  \). 
\end{thm}
The proof is in the appendix.

To complete the answer to question 3, we need to address the quality
of the policy found by this algorithm. Note that the bound on the
time until our algorithm ceases does not depend on the restart distribution
\( \mu  \) though the performance of the policy \( \pi  \) that
we find does depend on \( \mu  \) since \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })<2\varepsilon  \).
Crudely, for a policy to have near optimal performance then all advantages
must be small. Unfortunately, if \( d_{\pi ,\mu } \) is highly non-uniform,
then a small optimal policy advantage does not necessarily imply that
all advantages are small. The following corollary (of theorem \ref{th_quality})
bounds the performance of interest, \( \eta _{D} \), for the policy
found by the algorithm. We use the standard definition of the \( l_{\infty } \)-norm,
\( ||f||_{\infty }\equiv \max _{s}f(s) \).

\begin{cor}
\label{cor-quality} Assume that for some policy \( \pi  \), \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })<\varepsilon  \).
Let \( \pi ^{*} \) be an optimal policy. Then \begin{eqnarray*}
\eta _{D}(\pi ^{*})-\eta _{D}(\pi )\leq \frac{\varepsilon }{(1-\gamma )}\left\Vert \frac{d_{\pi ^{*},D}}{d_{\pi ,\mu }}\right\Vert _{\infty } &  & \\
\leq \frac{\varepsilon }{(1-\gamma )^{2}}\left\Vert \frac{d_{\pi ^{*},D}}{\mu }\right\Vert _{\infty }\, . &  & 
\end{eqnarray*}

\end{cor}
The factor of \( \left\Vert \frac{d_{\pi ^{*},D}}{d_{\pi ,\mu }}\right\Vert _{\infty } \)
represents a mismatch between the distribution of states of the current
policy and that of the optimal policy and elucidates the problem of
using the given start-state distribution \( D \) instead of a more
uniform distribution. Essentially, a more uniform \( d_{\pi ,\mu } \)
ensures that the advantages are small at states that an optimal policy
visits (determined by \( d_{\pi ^{*},D} \)). The second inequality
follows since \( d_{\pi ,\mu }(s)\geq (1-\gamma )\mu (s) \), and
it shows that a uniform measure prevents this mismatch from becoming
arbitrarily large. 

We now discuss and prove these theorems.


\section{Conservative Policy Iteration}

For simplicity, we assume knowledge of \( \varepsilon  \). The conservative
policy iteration \emph{}algorithm is:

\begin{enumerate}
\item Call \( G_{\varepsilon }(\pi ,\mu ) \) to obtain some \( \pi ^{\prime } \)
\item Use \( O(\frac{R^{2}}{\varepsilon ^{2}}\log \frac{R^{2}}{\delta \varepsilon ^{2}}) \)
\( \mu  \)-restarts to obtain an \( \frac{\varepsilon }{3} \)-accurate
estimate \( \mathbb {\hat{A}} \) of \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime }) \).
\item If \( \mathbb {\hat{A}}<\frac{2\varepsilon }{3} \), STOP and return
\( \pi  \).
\item If \( \mathbb {\hat{A}}\geq \frac{2\varepsilon }{3} \), then update
policy \( \pi  \) according to equation \ref{pol_update} using \( \frac{(1-\gamma )(\mathbb {\hat{A}}-\frac{\varepsilon }{3})}{4R} \)
and return to step 1. 
\end{enumerate}
where \( \delta  \) is the failure probability of the algorithm.
Note that the estimation procedure of step (2) allows us to set the
learning rate \( \alpha  \). 

We now specify the estimation procedure of step (2) to obtain \( \frac{\varepsilon }{3} \)-accurate
estimates of \( \mathbb {A}_{\pi }(\pi ^{\prime }) \). It is straightforward
to show that the policy advantage can be written as \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime })=E_{s\sim d_{\pi ,\mu }}\left[ \sum _{a}(\pi ^{\prime }(a;s)-\pi (a;s))Q_{\pi }(s,a)\right]  \).
We can obtain a nearly unbiased estimate \( x_{i} \) of \( \mathbb {A}_{\pi ,\mu }(\pi ^{\prime }) \)
using one call to the \( \mu  \)-restart distribution (see definition
\ref{simulator}). To obtain a sample \( s \) from from \( d_{\pi ,\mu } \),
we obtain a trajectory from \( s_{0}\sim \mu  \) and accept the current
state \( s_{\tau } \) with probability \( (1-\gamma ) \) (see equation
\ref{state_dist.}). Then we chose an action \( a \) from the uniform
distribution, and continue the trajectory from \( s \) to obtain
a nearly unbiased estimate \( \hat{Q}_{\pi }(s,a) \) of \( Q_{\pi }(s,a) \).
Using importance sampling, the nearly unbiased estimate of the policy
advantage from the \( i \)-th sample is \( x_{i}=n_{a}\hat{Q}_{i}(s,a)(\pi ^{\prime }(a;s)-\pi (a;s)) \)
where \( n_{a} \) is the number of actions. We assume that each trajectory
is run sufficiently long such that the bias in \( x_{i} \) is less
than \( \frac{\varepsilon }{6} \). 

Since \( \hat{Q}_{i}\in [0,R] \), our samples satisfy \( x_{i}\in [-n_{a}R,n_{a}R] \).
Using Hoeffding's inequality for \( k \) independent, identically
distributed random variables, we have: \begin{equation}
\label{hoeffding}
\Pr \left( |\mathbb {A}-\mathbb {\hat{A}}|>\Delta \right) \leq 2e^{-\frac{k\Delta ^{2}}{2n_{a}^{2}R^{2}}}\, ,
\end{equation}
 where \( \mathbb {\hat{A}}=\frac{1}{k}\sum _{i=1}^{k}x_{i} \) (and
\( \mathbb {A} \) here is \( \frac{\varepsilon }{6} \) biased).
Hence, the number of trajectories required to obtain a \( \Delta  \)
accurate sample with a fixed error rate is \( O\left( \frac{n_{a}^{2}R^{2}}{\Delta ^{2}}\right)  \). 

The proof of theorem \ref{CPI}, which guarantees the soundness of
conservative policy iteration, is straightforward and in the appendix.


\section{How Good is The Policy Found?}

Recall that the bound on the speed with which our algorithm ceases
does not depend on the restart distribution used. In contrast, we
now show that the quality of the resulting policy could strongly depend
on this distribution. 

The following lemma is useful:

\begin{lem}
\label{performance_difference}For any policies \( \tilde{\pi } \)
and \( \pi  \) and any starting state distribution \( \mu  \), \[
\eta _{\mu }(\tilde{\pi })-\eta _{\mu }(\pi )=\frac{1}{1-\gamma }E_{(a,s)\sim \tilde{\pi }d_{\tilde{\pi },\mu }}\left[ A_{\pi }(s,a)\right] \]

\end{lem}
\begin{proof}
Let \( P_{t}(s^{\prime })\equiv P(s_{t}=s^{\prime };\tilde{\pi },s_{0}=s) \).
By definition of \( V_{\tilde{\pi }}(s) \), \begin{eqnarray*}
 &  & V_{\tilde{\pi }}(s)\\
 & = & (1-\gamma )\sum _{t=0}^{\infty }\gamma ^{t}E_{(a_{t},s_{t})\sim \tilde{\pi }P_{t}}\left[ \mathcal{R}(s_{t},a_{t})\right] \\
 & = & \sum _{t=0}^{\infty }\gamma ^{t}E_{(a_{t},s_{t})\sim \tilde{\pi }P_{t}}\left[ (1-\gamma )\mathcal{R}(s_{t},a_{t})\right. \\
 &  & \left. +V_{\pi }(s_{t})-V_{\pi }(s_{t})\right] \\
 & = & \sum _{t=0}^{\infty }\gamma ^{t}E_{(a_{t},s_{t},s_{t+1})\sim \tilde{\pi }P_{t}P(s_{t+1};s_{t},a_{t})}\left[ (1-\gamma )\mathcal{R}(s_{t},a_{t})\right. \\
 &  & \left. +\gamma V_{\pi }(s_{t+1})-V_{\pi }(s_{t}))\right] +V_{\pi }(s)\\
 & = & V_{\pi }(s)+\sum _{t=0}^{\infty }\gamma ^{t}E_{(a_{t},s_{t})\sim \tilde{\pi }P_{t}}\left[ A_{\pi }(s_{t},a_{t})\right] \\
 & = & V_{\pi }(s)+\frac{1}{1-\gamma }E_{(a,s^{\prime })\sim \tilde{\pi }d_{\tilde{\pi },s}}\left[ A_{\pi }(s^{\prime },a)\right] 
\end{eqnarray*}
 and the result follows by taking the expectation of this equation
with respect to \( \mu  \).
\end{proof}
This lemma elucidates a fundamental \emph{measure mismatch}. The performance
measure of interest, \( \eta _{D}(\pi ) \) changes in proportion
the policy advantage \( \mathbb {A}_{\pi ,D}(\pi ^{\prime }) \) for
small \( \alpha  \) (see equation \ref{local_improve}), which is
the average advantage under the state distribution \( d_{\pi ,D} \).
However, for an optimal policy \( \pi ^{*} \), the difference between
\( \eta _{D}(\pi ^{*}) \) and \( \eta _{D}(\pi ) \) is proportional
to the average advantage under the state distribution \( d_{\pi ^{*},D} \).
Thus, even if the optimal policy advantage is small with respect to
\( \pi  \) and \( D \), the advantages may not be small under \( d_{\pi ^{*},D} \).
This motivates the use of the more uniform distribution \( \mu  \).

After termination, our algorithm returns a policy \( \pi  \) which
has small policy advantage with respect to \( \mu  \). We now quantify
how far from optimal \( \pi  \) is, with respect to an arbitrary
measure \( \tilde{\mu } \).

\begin{thm}
\label{th_quality}Assume that for a policy \( \pi  \), \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })<\varepsilon  \).
Let \( \pi ^{*} \) be an optimal policy. Then for any state distribution
\( \tilde{\mu } \), \begin{eqnarray*}
\eta _{\tilde{\mu }}(\pi ^{*})-\eta _{\tilde{\mu }}(\pi )\leq \frac{\varepsilon }{(1-\gamma )}\left\Vert \frac{d_{\pi ^{*},\tilde{\mu }}}{d_{\pi ,\mu }}\right\Vert _{\infty } &  & \\
\leq \frac{\varepsilon }{(1-\gamma )^{2}}\left\Vert \frac{d_{\pi ^{*},\tilde{\mu }}}{\mu }\right\Vert _{\infty }\, . &  & 
\end{eqnarray*}

\end{thm}
\begin{proof}
The optimal policy advantage is \( \textrm{OPT}(\mathbb {A}_{\pi ,\mu })=\sum _{s}d_{\pi ,\mu }(s)\max _{a}A_{\pi }(s,a) \).
Therefore, \begin{eqnarray*}
\varepsilon  & > & \sum _{s}\frac{d_{\pi ,\mu }(s)}{d_{\pi ^{*},\tilde{\mu }}(s)}d_{\pi ^{*},\tilde{\mu }}(s)\max _{a}A_{\pi }(s,a)\\
 & \geq  & \min _{s}\left( \frac{d_{\pi ,\mu }(s)}{d_{\pi ^{*},\tilde{\mu }}(s)}\right) \sum _{s}d_{\pi ^{*},\tilde{\mu }}(s)\max _{a}A_{\pi }(s,a)\\
 & \geq  & \left\Vert \frac{d_{\pi ^{*},\tilde{\mu }}}{d_{\pi ,\mu }}\right\Vert _{\infty }^{-1}\sum _{s,a}d_{\pi ^{*},\tilde{\mu }}(s)\pi ^{*}(a;s)A_{\pi }(s,a)\\
 & = & (1-\gamma )\left\Vert \frac{d_{\pi ^{*},\tilde{\mu }}}{d_{\pi ,\mu }}\right\Vert _{\infty }^{-1}(\eta _{\tilde{\mu }}(\pi ^{*})-\eta _{\tilde{\mu }}(\pi ))
\end{eqnarray*}
where the last step follows from lemma \ref{performance_difference}.
The second inequality is due to \( d_{\pi ,\mu }(s)\leq (1-\gamma )\mu (s) \).
\end{proof}
Note that \( \left\Vert \frac{d_{\pi ^{*},\tilde{\mu }}}{\mu }\right\Vert _{\infty } \)
is a measure of the mismatch in using \( \mu  \) rather than the
future-state distribution of an optimal policy. The interpretation
of each factor of \( \frac{1}{1-\gamma } \) is important. In particular,
one factor of \( \frac{1}{1-\gamma } \) is due to the fact that difference
between the performance of \( \pi  \) and optimal is \( \frac{1}{1-\gamma } \)
times the average advantage under \( d_{\pi ^{*},\tilde{\mu }}(s) \)
(see lemma \ref{performance_difference}) and another factor of \( \frac{1}{1-\gamma } \)
is due to the inherent non-uniformity of \( d_{\pi ,\mu } \) (since
\( d_{\pi ,\mu }(s)\leq (1-\gamma )\mu (s) \)). 


\section{Discussion}

We have provided an algorithm that finds an {}``approximately{}''
optimal solution that is polynomial in the approximation parameter
\( \varepsilon  \), but not in the size of the state space. We discuss
a few related points.


\subsection{The Greedy Policy Chooser}

The ability to find a policy with a large policy advantage can be
stated as a regression problem though we don't address the sample
complexity of this problem. Let us consider the error given by:\[
E_{s\sim d_{\pi ,\mu }}\max _{a}|A_{\pi }(s,a)-f_{\pi }(s,a)|\, .\]
This loss is an \emph{average} loss over the state space (though it
is an \( \infty  \)-loss over actions). It is straightforward to
see that if we can keep this error below \( \frac{\varepsilon }{2} \),
then we can construct an \( \varepsilon  \)-greedy policy chooser
by choosing a greedy policy based on these approximation \( f_{\pi } \).
This \( l_{1} \) condition for the regression problem is a much weaker
constraint than minimizing an \( l_{\infty } \)-error over the state-space,
which is is the relevant error for greedy dynamic programming (see
equation \ref{greedy_pi} and \cite{neuro-programming}).

Direct policy search methods could also be used to implement this
greedy policy chooser. 


\subsection{What about improving \protect\( \eta _{D}\protect \)?}

Even though we ultimately seek to have good performance measure under
\( \eta _{D} \), we show that it is important to improve the policy
under a somewhat uniform measure. An important question is {}``Can
we improve the policy according to both \( \eta _{D} \) and \( \eta _{\mu } \)
at each update?{}'' In general the answer is {}``no{}'', but consider
improving the performance under \( \tilde{\mu }=(1-\beta )\mu +\beta D \)
instead of just \( \mu  \). This metric only slightly changes the
quality of the asymptotic policy. However by giving weight to \( D \),
the possibility of improving \( \eta _{D} \) is allowed if the optimal
policy has large advantages under \( D \), though we do not formalize
this here. The only situation where joint improvement with \( \eta _{D} \)
is not possible is when \( \textrm{OPT}(\mathbb {A}_{\pi ,D}) \)
is small. However, this is the problematic case where, under \( D \),
the large advantages are not at states visited frequently.


\subsection{Implications of the mismatch}

The bounds we have presented directly show the importance of ensuring
the agent starts in states where the optimal policy tends to visit.
It also suggests that certain optimal policies are easier to learn
in large state spaces --- namely those optimal policies which tend
to visit a significant fraction of the state space. An interesting
suggestion for how to choose \( \mu  \), is to use prior knowledge
of which states an optimal policy tends to visit.


\subsection*{Acknowledgments}

We give warm thanks to Peter Dayan for numerous critical comments.

\bibliographystyle{plain}
\bibliography{bib}



\section{Appendix: Proofs}

The intuition for the proof of theorem \ref{th_bound} is that \( \alpha  \)
determines the probability of choosing an action from \( \pi ^{\prime } \).
If the current state distribution is \( d_{\pi ,\mu } \) when an
action from \( \pi ^{\prime } \) is chosen, then the performance
improvement is proportional to the policy advantage. The proof involves
bounding the performance decrease due to the state distribution not
being exactly \( d_{\pi ,\mu } \), when an action from \( \pi ^{\prime } \)
is chosen.

\begin{proof}
Throughout the proof, the \( \mu  \) dependence is not explicitly
stated. For any state \( s \),\begin{eqnarray*}
 &  & \sum _{a}\pi _{\textrm{new}}(s;a)A_{\pi }(s,a)\\
 & = & \sum _{a}((1-\alpha )\pi (a;s)+\alpha \pi ^{\prime }(a;s))A_{\pi }(s,a)\\
 & = & \alpha \sum \pi ^{\prime }(a;s)A_{\pi }(s,a)\, .
\end{eqnarray*}
where we have used \( \sum _{a}\pi (a;s)A_{\pi }(s,a)=0 \). 

For any timestep, the probability that we choose an action according
to \( \pi ^{\prime } \) is \( \alpha  \). Let \( c_{t} \) be the
random variable indicating the number of actions chosen from \( \pi ^{\prime } \)
before time \( t \). Hence, \( \Pr (c_{t}=0)=(1-\alpha )^{t} \).
Defining \( \rho _{t}\equiv \Pr (c_{t}\geq 1)=1-(1-\alpha )^{t} \)
and \( P(s_{t};\pi ) \) to be distribution over states at time \( t \)
while following \( \pi  \) starting from \( s\sim \mu  \), it follows
that\begin{eqnarray*}
 &  & E_{s\sim P(s_{t};\pi _{\textrm{new}})}\left[ \sum _{a}\pi _{\textrm{new}}(a;s)A_{\pi }(s,a)\right] \\
 & = & \alpha E_{s\sim P(s_{t};\pi _{\textrm{new}})}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] \\
 & = & \alpha (1-\rho _{t})E_{s\sim P(s_{t}|c_{t}=0;\pi _{\textrm{new}})}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] \\
 &  & +\alpha \rho _{t}E_{s\sim P(s_{t}|c_{t}\geq 1;\pi _{\textrm{new}})}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] \\
 & \geq  & \alpha E_{s\sim P(s_{t}|c_{t}=0;\pi _{\textrm{new}})}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] \\
 &  & -2\alpha \rho _{t}\varepsilon \\
 & = & \alpha E_{s\sim P(s_{t};\pi )}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] -2\alpha \rho _{t}\varepsilon 
\end{eqnarray*}
 where we have used the definition of \( \varepsilon  \) and \( P(s_{t}|c_{t}=0;\pi _{\textrm{new}})=P(s_{t};\pi ) \). 

By substitution and lemma \ref{performance_difference}, we have:\begin{eqnarray*}
 &  & \eta _{\mu }(\pi _{\textrm{new}})-\eta _{\mu }(\pi )\\
 & = & \sum _{t=0}^{\infty }\gamma ^{t}E_{s\sim P(s_{t};\pi _{\textrm{new}})}\left[ \sum _{a}\pi _{\textrm{new}}(a;s)A_{\pi }(s,a)\right] \\
 & \geq  & \alpha \sum _{t=0}^{\infty }\gamma ^{t}E_{s\sim P(s_{t};\pi )}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s,a)\right] \\
 &  & -2\alpha \varepsilon \sum _{t=0}^{\infty }\gamma ^{t}(1-(1-\alpha )^{t})\\
 & = & \frac{\alpha }{1-\gamma }E_{s\sim d_{\pi }}\left[ \sum _{a}\pi ^{\prime }(a;s)A_{\pi }(s_{t},a)\right] \\
 &  & -2\alpha \varepsilon (\frac{1}{1-\gamma }-\frac{1}{1-\gamma (1-\alpha )})\, .
\end{eqnarray*}
 The result follows from simple algebra.
\end{proof}
The proof of theorem \ref{CPI} follows.

\begin{proof}
During step (2), we need enough samples such that \( |\mathbb {A}-\mathbb {\hat{A}}|<\frac{\varepsilon }{3} \)
for every loop of the algorithm and, as proved below, we need to consider
at most \( \frac{72R^{2}}{\varepsilon ^{2}} \) loops. If we demand
that the probability of failure is less than \( \delta  \), then
by the union bound and inequality \ref{hoeffding}, we have for \( k \)
trajectories \( P(\textrm{failure})<\frac{72R^{2}}{\varepsilon ^{2}}2e^{-\frac{k\varepsilon ^{2}}{36n_{a}^{2}R^{2}}}<\delta  \),
where we have taken \( \Delta =\frac{\varepsilon }{6} \) since the
bias in our estimates is at most \( \frac{\varepsilon }{6} \). Thus,
we require \( O\left( \frac{R^{2}}{\varepsilon ^{2}}\log \frac{R^{2}}{\delta \varepsilon ^{2}}\right)  \)
trajectories.

Thus, if \( \mathbb {\hat{A}}\geq \frac{2\varepsilon }{3} \), then
\( \mathbb {A}\geq \frac{\varepsilon }{3}>0 \). By corollary \ref{th_how_far},
step 4 guarantees improvement of \( \eta _{\mu } \) by at least \( \frac{\left( \mathbb {\hat{A}}-\frac{\varepsilon }{3}\right) ^{2}}{8R}\geq \frac{\varepsilon ^{2}}{72R} \)
using \( \alpha =\frac{(1-\gamma )(\mathbb {\hat{A}}-\frac{\varepsilon }{3})}{4R} \),
which proves \( i \). Since \( 0\leq \eta _{\mu }\leq R \), there
are at most \( \frac{72R^{2}}{\varepsilon ^{2}} \) steps before \( \eta _{\mu } \)
becomes larger than \( R \), so the algorithm must cease in this
time, which proves \( ii \). In order to cease and return \( \pi  \),
on the penultimate loop, the \( G_{\varepsilon } \) must have returned
some \( \pi ^{\prime } \) such that \( \mathbb {\hat{A}}<\frac{2\varepsilon }{3} \),
which implies \( \mathbb {A}_{\pi }(\pi ^{\prime })<\varepsilon  \).
By definition of \( G_{\varepsilon } \), it follows that \( \textrm{OPT}(\mathbb {A}_{\pi })<2\varepsilon  \),
which proves \emph{iii.} \end{proof}

\end{document}
