\documentclass[twoside,11pt]{llncs}
\usepackage{amsmath,amssymb,latexsym}
\usepackage{epsfig}
\usepackage{natbib}
\usepackage{graphicx}
\bibliographystyle{plainnat}
%\input{localdeclarations}
\input epsf
\newtheorem{construction}{Construction}
\newtheorem{slaim}{Claim}
\newtheorem{sublemma}{Sublemma}
\newcommand{\commentout}[1]{}
\newcommand{\naturals}{\ensuremath{\mathbb N}}
\newcommand{\cX}{\ensuremath{{\cal X}}}
\newcommand{\cY}{\ensuremath{{\cal Y}}}
\newcommand{\cZ}{\ensuremath{{\cal Z}}}
\newcommand{\cC}{\ensuremath{{\cal C}}}
\newcommand{\cP}{\ensuremath{{\cal P}}}
\newcommand{\cA}{\ensuremath{{\cal A}}}
\newcommand{\cB}{\ensuremath{{\cal B}}}
\newcommand{\cD}{\ensuremath{{\cal D}}}
\newcommand{\condent}{H}
\newcommand{\erateS}{\hat{e}_S}
%peter2added
\newcommand{\erateShard}{\hat{e}_{S,\text{h}}}
\newcommand{\erateSeasy}{\hat{e}_{S,\text{easy}}}
\newcommand{\Shard}{S_{\text{h}}}
%
\newcommand{\erateD}{e_D}
\newcommand{\ebad}{e_{\text{bad}}}
\newcommand{\KL}{\text{\sc KL}}
\newcommand{\cbayesopt}{c_{D \text{\sc opt}}}
\newcommand{\ebayes}{e_D(c_{\text{\sc Bayes}(P,S)})}
\newcommand{\emap}{e_D(c_{\text{\sc map}(P,S)})}
\newcommand{\esmap}{e_D(c_{\text{\sc smp}(P,S)})}
\newcommand{\hesmapb}{\hat{e}_S(c^*)}
\newcommand{\he}{\hat{e}_S(c)}
\newcommand{\emdl}{e_D(c_{\text{\sc mdl}(P,S)})}
\newcommand{\eorb}{e_D(c_{\text{\sc orb}(P,S)})}
\newcommand{\cbayes}{c_{\text{\sc Bayes}(P,S)}}
\newcommand{\cbayesi}{c_{\text{\sc Bayes}(P,S^{i-1})}}
\newcommand{\cmap}{c_{\text{\sc map}(P,S)}}
\newcommand{\csmap}{c_{\text{\sc smp}(P,S)}}
\newcommand{\csmapb}{c_{\text{\sc smp}}}
\newcommand{\cmdl}{c_{\text{\sc mdl}(P,S)}}
\newcommand{\corb}{c_{\text{\sc orb}(P,S)}}
\newcommand{\femdl}{e_{f(c)}(c_{\text{\sc mdl}(P,S)})}
\newcommand{\feorb}{e_{f(c)}(c_{\text{\sc orb}(P,S)})}
\newcommand{\febayes}{e_{f(c)}(c_{\text{\sc Bayes}(P,S)})}
\newcommand{\femap}{e_{f(c)}(c_{\text{\sc map}(P,S)})}
\newcommand{\fesmap}{e_{f(c)}(c_{\text{\sc smp}(P,S)})}
\newcommand{\phard}{p_{\text{h}}}
%\newcommand{\muhard}{\mu_{\text{h}}}
\newcommand{\Mhard}{M_{\text{h}}}
\newcommand{\mhard}{m_{\text{h}}}
\newcommand{\psmall}{p_{\text{small}}}
\newcommand{\pmiddle}{p_{\text{middle}}}
\newcommand{\plarge}{p_{\text{large}}}
% Heading arguments are {volume}{year}{pages}{submitted}{published}{authorfullnames}
% Short headings should be running head and authors last names
\title{Suboptimal Behavior of Bayes and MDL \\
in Classification under Misspecification}
%\author{Peter Gr\"unwald (pdg@cwi.nl), CWI Amsterdam, The Netherlands \\
% John Langford (jl@hunch.net), TTIChicago, Chicago, USA}
\author{Peter Gr\"unwald\inst{1} \and John Langford\inst{2}}
\institute{CWI Amsterdam,
\email{pdg@cwi.nl}, \texttt{www.grunwald.nl}.
\and TTI-Chicago, \email{jl@hunch.net}, \texttt{hunch.net/\~{}jl/}.}
\setcounter{secnumdepth}{3}
%to make sure that subsubsections get a number again!
\pagestyle{plain}
\thispagestyle{plain}
%remove again in final version!
\begin{document}
\maketitle
\begin{abstract}
We show that forms of Bayesian and MDL inference that are often
applied to classification problems can be {\em inconsistent}. This
means that there exists a learning problem such that for all amounts
of data the generalization errors of the MDL classifier and the
Bayes classifier relative to the Bayesian posterior both remain
bounded away from the smallest achievable generalization error. From
a Bayesian point of view, the result can be reinterpreted as saying
that Bayesian inference can be inconsistent under misspecification,
even for countably infinite models. We extensively discuss the
result from both a Bayesian and an MDL perspective.
\end{abstract}
\section{Introduction}
Overfitting is a central concern of machine learning and
statistics. Two frequently used learning methods that in many cases
`automatically' protect against overfitting are Bayesian inference
\citep{BernardoS94} and the Minimum Description Length (MDL) Principle
\citep{Rissanen89,BarronRY98,Grunwald05,Grunwald07}. We show that, when applied
to classification problems, some of the standard variations of these
two methods can be {\em inconsistent\/} in the sense that they {\em
asymptotically overfit\/}: there exist scenarios where, no matter how
much data is available, the generalization error of a classifier based
on MDL or the full Bayesian posterior does not converge to the minimum
achievable generalization error within the set of classifiers under
consideration.
This result may be viewed as a challenge to Bayesian inference. Given
a powerful piece of information (an objectively correct ``prior'' on a
set of {\em classifiers\/}), transforming this information into a
Bayesian prior on a set of {\em distributions\/} in a straightforward
manner and applying Bayes rule yields significantly suboptimal
behavior---while another simple approach yields optimal behavior. The key is
the transformation from classifiers (functions mapping each input $X$
to a discrete class label $Y$) to (conditional) distributions. Such a
transformation is necessary because Bayes rule cannot be directly
applied to classifiers. We do the conversion in a straightforward
manner, crossing a prior on classifiers with a prior on error rates
for these classifiers.
%peter2 This conversion method is completely standard
This conversion method is a completely standard tool for Bayesians
active in the field of machine learning
%
---we tested this with some professing Bayesians, see
Section~\ref{sec:bayestest}---yet it inevitably leads to (sometimes
subtly) `misspecified' probability models not containing the `true'
distribution $D$. The result may therefore be reinterpreted as
`Bayesian inference can be inconsistent under
misspecification for common classification probability models'.
Since, in practice, Bayesian inference for classification tasks is
frequently and inevitably based on misspecified probability models,
the result remains relevant even if (as many Bayesians do, especially
those not active in the field of machine learning) one insists
that inference starts directly with a probability model, rather than a
classification model that is then transformed into a probability model
---see Section~\ref{sec:bayes}.
There are two possible resolutions to this challenge. Perhaps
Bayesian inference is an incomplete characterization of learning:
there exist pieces of information (e.g. prior information on
deterministic classifiers rather than distributions) which can not be well
integrated into a prior distribution and so learning algorithms other
than Bayesian inference are sometimes necessary. Or, perhaps there is
some less naive method allowing a prior to express the available
information. We discuss this issue further in Section~\ref{sec:bayes}.
%peter1 end of changed part
\commentout{
%peter1 we are so careful now that this is not needed any more.
\paragraph{Some Caveats and Warnings} Our result must be interpreted
carefully. There exist many different versions of MDL and Bayesian
inference, only some of which are covered. For the case of MDL, we
show the result for a two-part form of MDL that has often been used
for classification, see Section~\ref{sec:mdl}. For the case of Bayes,
the result may appear to contradict some well-known Bayesian
consistency results \citep{BlackwellD62}. Indeed, the result only
applies to a `pragmatic' use of Bayes involving a transformation of classifiers to conditional distributions.
%peter1 moved this piece up
Indeed, the result only applies to a `pragmatic' use of Bayes, where
the set of hypotheses under consideration are {\em classifiers}:
functions mapping each input $X$ to a discrete class label $Y$. To
apply Bayes rule, these classifiers must be converted into
conditional probability distributions. We do this conversion in a
standard manner, crossing a prior on classifiers with a prior on
error rates for these classifiers. }
\subsection{A Preview}
\subsubsection{Classification Problems}
A classification problem is defined on an input (or feature) domain
$\cX$ and output domain (or class label) $\cY = \{0,1\}$. The problem
is defined by a probability distribution $D$ over $\cX
\times \cY$. A classifier is a function
$c: \cX \rightarrow \cY$. The error
rate of any classifier is quantified as:
$$\erateD(c) = E_{(x,y) \sim D} I(c(x) \neq y)$$
where $(x,y) \sim D$ denotes
a draw from the distribution $D$ and $I(\cdot)$ is the indicator
function which is $1$ when its argument is true and $0$ otherwise.
The goal is to find a classifier which, as often as possible according to $D$,
correctly predicts the class label given the input feature.
Typically, the classification problem is solved by searching for some
classifier $c$ in a limited subset $\cC$ of all classifiers using a
sample $S = (x_1,y_1), \ldots, (x_m,y_m) \sim D^m$
generated by $m$ independent draws from
the distribution $D$. Naturally, this search is guided by the
{\em empirical error rate}. This is the error rate on the subset $S$ defined by:
$$
\erateS(c) :=
E_{(x,y) \sim S} I(c(x)\neq y) = \frac{1}{|S|} \sum_{(x,y)\in S} I(c(x) \neq y).
$$ where $(x,y) \sim S$ denotes a sample drawn from the uniform
distribution on $S$. Note that $\erateS(c)$ is a random variable
dependent on a draw from $D^m$. In contrast, $\erateD(c)$ is a number (an
expectation) relative to $D$.
\subsubsection{The Basic Result}
The basic result is that certain classifier learning algorithms may
not behave well as a function of the information they use, even when
given infinitely many samples to learn from. The learning algorithms
we analyze are ``Bayesian classification'' (Bayes), ``Maximum a
Posteriori classification'' (MAP), ``Minimum Description Length
classification'' (MDL) and ``Occam's Razor Bound classification''
(ORB). These algorithms are precisely defined later. Functionally
they take as arguments a training sample $S$ and a ``prior'' $P$ which
is a probability distribution over a set of classifiers $\cC$. The
result applies even when the process creating classification problems
draws the optimal classifier from $P(c)$. In Section~\ref{sec:result}
we state the basic result, Theorem~\ref{thm:bayesinconsistent}. The
theorem has the following corollary, indicating suboptimal behavior of
Bayes and MDL:
\commentout{
\begin{corollary}
\label{cor:inconsistent}
{\bf (Classification Inconsistency) \ } There exists an input
domain ${\cal X}$ and an (objectively correct) prior $P(c)$ on
a countable set of classifiers $\cC$ such that the Bayesian
classifier $\cbayes$, the MAP classifier $\cmap$, and the MDL
classifier $\cmdl$ are asymptotically suboptimal with respect
to the ORB classifier $\corb$. That is:
\begin{multline}
\lim_{m \rightarrow \infty} \Pr_{S \sim D^m} \left(
\eorb + 0.1 < \min \{ \ebayes, \emap, \emdl \} \right) = 1.
\end{multline}
\end{corollary}
}
\begin{corollary}
\label{cor:inconsistent}
{\bf (Classification Inconsistency) \ } There exists an input
domain ${\cal X}$ and a prior $P(c)$ on
a countable set of classifiers $\cC$ such that:
\begin{description}
\item[\rm 1. (inconsistency according to true distribution)] There exists a learning problem (distribution) $D$ such that the Bayesian
classifier $\cbayes$, the MAP classifier $\cmap$, and the MDL
classifier $\cmdl$ are asymptotically suboptimal with respect
to the ORB classifier $\corb$. That is, for $c^* \in \{ \cbayes, \cmap, \cmdl \}$,
\end{description}
\begin{equation}
\label{eq:hulk}
\lim_{m \rightarrow \infty} \Pr_{S \sim D^m} \left(
\eorb + 0.05 < e_D(c^*) \right) = 1.
\end{equation}
\begin{description}
\item[\rm 2. (inconsistency according to prior)]
There exists a randomized algorithm selecting learning problems $D$ in
such a way that
\subitem(a) the prior $P(c)$ is `objectively correct' in the
sense that, for all $c \in \cC$, with probability $P(c)$, $c$ is the optimal classifier,
achieving $\min_{c \in \cC} e_D(c)$.
\subitem(b) {\rm (\ref{eq:hulk})} holds no matter what learning problem $D$/classifier
$c$ is selected. In particular, {\rm (\ref{eq:hulk})} holds with prior
probability 1.
\end{description}
\end{corollary}
How dramatic is this result? We may ask
\begin{enumerate}
\item Are the priors $P$ for
which the result holds natural?
\item How large can the suboptimality
become and how small can $\eorb$ be?
\item Does this matter for logarithmic loss (which is what MDL approaches seek to minimize \citep{Grunwald07}) rather than 0/1 loss?
\item Is $\corb$ an algorithm which
contains information specific to the distribution $D$?
\item Is this
theorem relevant at small (and in particular noninfinite) sample
sizes?
\end{enumerate}
We will ask a number of more detailed questions from a Bayesian
perspective in Section~\ref{sec:bayes} and from an MDL perspective in
Section~\ref{sec:mdl}. The short answer to (1) and (2) is: the priors
$P$ have to satisfy several requirements, but they correspond to
priors often used in practice. The size of the suboptimality can be
quite large, at least for the MAP and MDL classifiers (the number of
$0.05$
%peter1 changed this.
was just chosen for concreteness;
other values are possible) and $\eorb$ can be quite small---see
Section~\ref{sec:technical} and Figure~\ref{fig:allowed}. The short
answer to (3) is ``yes''. A similar result holds for logarithmic
loss, see Section~\ref{sec:bayescons}.
The answer to (4) is ``no''. The algorithm $\corb$, which minimizes
the Occam's Razor bound (ORB) (see \citep{BlumerEHW87} or
Section~\ref{sec:orb}), is asymptotically consistent for any $D$:
\begin{theorem}
\label{thm:consistent}
{\bf (ORB consistency) \ } For all priors $P$ nonzero on a set
of classifiers $\cC$, for all learning problems $D$, and all constants
$K> 0$ the ORB classifier $\corb$
is asymptotically $K$optimal:
$$
\lim_{m \rightarrow \infty} \Pr_{S \sim D^m} \left(\eorb > K + \inf_{c \in \cC} \erateD(c) \right) = 0.$$
\end{theorem}
The answer to (5) is that the result is very relevant for small sample
sizes because the convergence to the probability 1 event occurs at a
speed exponential in $m$. Although the critical example uses a
countably infinite set of classifiers, on a finite set of $n$
classifiers, the analysis implies that for $m < \log n$, Bayesian
inference gives poorer performance than Occam's Razor Bound
optimization.
\paragraph{Overview of the Paper}
The remainder of this paper first defines precisely what we mean by
the above classifiers. It then states the main inconsistency theorem
which implies the above corollary, as well as a theorem that provides
an upperbound on how badly Bayes can behave. In
Section~\ref{sec:proofs} we prove the theorems. Technical discussion,
including variations of the result, are discussed in
Section~\ref{sec:technical}. A discussion of the result from a
Bayesian point of view is given in Section~\ref{sec:bayes}, and from
an MDL point of view in Section~\ref{sec:mdl}.
\section{Some Classification Algorithms}
\label{sec:algorithms}
The basic inconsistency result is about particular classifier learning
algorithms which we define next.
\subsection{The Bayesian Classification algorithm}
The Bayesian approach to inference starts with a prior probability
distribution $P$ over a set of distributions $\cP$. $P$ typically
represents a measure of ``belief'' that some $p \in \cP$ is the
process generating data. Bayes' rule states that, given sample data $S$,
the posterior
probability $P(\cdot \mid S)$
that some $p$ is the process generating the data
is:
\begin{equation}
\label{eq:bayesrules}
P(p \mid S) =\frac{p(S)P(p)}{P(S)}.
\end{equation}
where $P(S) := E_{p \sim P} [p(S)] = \sum_{p \in \cP} P(p)p(S)$, the sum being replaced by an integral when $\cP$ is continuous and $P$ admits a density.
Note that in Bayesian statistics, $p(S)$ is usually denoted as $P(S \mid p)$.
In classification problems with sample size $m = |S|$,
each $p \in \cP$ is a distribution on $(\cX \times \cY)^m$
and the outcome $S=(x_1,y_1), \ldots, (x_m,y_m)$ is the sequence
of labeled examples.
If we intend to perform classification based on a set of classifiers
$\cC$ rather than distributions $\cP$, it is natural to introduce a
``prior'' $P(c)$ that a particular classifier $c: \cX \rightarrow
\{0,1\}$ is the best classifier for solving some learning problem.
This, of course, is \emph{not} a Bayesian prior in the conventional
sense because classifiers do not induce a measure over the training
data.
%peter: I felt a few lines were missing here! Added some extra explanation
In order to apply Bayesian inference, we somehow need to convert the
set of classifiers into a corresponding set of distributions $\cP$.
With such a conversion, the prior $P(c)$ will induce a conventional
Bayesian prior on $\cP$ after all.
%peter: end of added part
%It is the standard method of converting a ``prior'' over
%classifiers into a Bayesian prior over distributions on the
%observations which the inconsistency result applies to.
One common conversion \citep{Jordan95,Tipping01,Grunwald98b}
transforms the set of classifiers $\cC$ into a simple logistic
regression model---the precise relationship to logistic regression is
discussed in Section~\ref{sec:trafo}. In the special case
considered in this paper, $c(x) \in \{0,1\}$ is binary valued.
Then (but only then) the conversion amounts to
assuming that the error rate $\theta$ of the optimal classifier is
independent of the feature value $x$. This is known as
``homoskedasticity'' in statistics and ``label noise'' in learning
theory. More precisely,
%peter3 changed
we let ${\cal P}$ consist of the set of distributions $p_{c,\theta}$,
where $c \in {\cal C}$ and $\theta \in [0,1]$. These are defined as
conditional probability distributions over the labels given the
unlabeled data:
\begin{equation}
\label{eq:regmodel}
p_{c,\theta}(y^m \mid x^m) =
\theta^{m \hat{e}_S(c)} (1-\theta)^{m - m \erateS(c)}.
\end{equation}
This expresses that there exists some $\theta$ such that $\forall
x\,\,\, p_{c,\theta}(c(X) \neq Y \mid X=x) = \theta$ (homoskedasticity).
Note that
$$p_{c,\theta}(y \mid x) = \begin{cases}
\theta & \text{if $c(x) \neq y$} \\
1-\theta & \text{if $c(x) = y$}.
\end{cases}
$$
For each fixed $\theta < 0.5$, the log likelihood
$\log p_{c,\theta}(y^m \mid x^m)$ is linearly decreasing in the empirical
error that $c$ makes on $S$.
By differentiating with respect to
$\theta$, we see that for fixed $c$, the likelihood
(\ref{eq:regmodel}) is maximized by setting $\theta :=
\erateS(c)$, giving
\begin{equation}
\label{eq:safe}
\log \frac{1}{p_{c,\erateS(c)}(y^m \mid x^m)} = m H(\hat{e}_S(c)),
\end{equation}
where $H$ is the binary entropy $H(\mu) = \mu \log \frac{1}{\mu} +
(1-\mu) \log \frac{1}{1-\mu}$, which is strictly increasing for $\mu \in
[0,0.5)$. Here, as everywhere else in the paper, $\log$ stands for
binary logarithm.
%peter: added
Thus, the conversion is ``reasonable'' in the sense that, both with
fixed $\theta < 0.5$ and with the likelihoodmaximizing $\theta =
\erateS(c)$ which varies with the data, the likelihood is a decreasing
function in the empirical error rate of $c$, so that classifiers which
achieve small error on the data correspond to a large likelihood of
the data.
%
We further assume that {\em some\/} distribution $p_x$ on
${\cal X}^m$ generates the $x$values, and, in particular that
this distribution is independent of $c$ and $\theta$. With this assumption,
we can apply
Bayes rule to get a posterior on $p_{c,\theta}$, denoted as
$P(c,\theta \mid S)$, without knowing $p_x$, since the
$p_x(x^m)$factors cancel:
\begin{equation}
\label{eq:bayes}
P({c,\theta} \mid S) = \frac{p_{c,\theta}(y^m \mid x^m)p_x(x^m)P(c,\theta)}{P(y^m \mid x^m) p_x(x^m)} =
\frac{p_{c,\theta}(y^m \mid x^m)P(c,\theta)}{E_{{c,\theta} \sim P}
[p_{c,\theta}(y^m \mid x^m)] }.
\end{equation}
As is customary in Bayesian statistics, here as well as in the
remainder of this paper we defined the prior $P$ over $(c,\theta)$
rather than directly over $p_{c,\theta}$. The latter choice would be
equivalent but notationally more cumbersome.
To make (\ref{eq:bayes})
applicable, we need to specify a prior measure on the joint space $\cC
\times [0,1]$ of classifiers and $\theta$parameters. In the next
section we discuss the priors under which the theorems hold.
Bayes rule (\ref{eq:bayes}) is formed into a classifier learning
algorithm by choosing the most likely label given the input $x$ and
the posterior $P(\cdot \mid S)$:
\begin{equation}
\label{eq:bayesact}
\cbayes(x) := \begin{cases}
1 & \text{if} \ E_{{c,\theta} \sim P(\cdot \mid S)} \/[ p_{c,\theta}(Y =
1 \mid X=x) ] >
\frac{1}{2}, \\
0 & \text{if} \ E_{{c,\theta} \sim P(\cdot \mid S)} \/[ p_{c,\theta}(Y =
1 \mid X=x) ] <
\frac{1}{2}.
\end{cases}
\end{equation}
%peter2 changed (need this extended formulation now in proof of theorem 2)
If $E_{{c,\theta} \sim P(\cdot \mid S)} \/[ p_{c,\theta}(Y =
1 \mid X=x) ] =
\frac{1}{2}$, then the value of $\cbayes$ is determined by
an independent toss of a fair coin.
\subsection{The MAP classification algorithm}
The integrations of the full Bayesian classifier can be too
computationally intensive, so in practice one often predicts using the
Maximum A Posteriori (MAP) classifier. This classifier is given by:
$$
\cmap =
\arg \max_{c \in \cC} \max_{\theta \in [0,1]} P(c,\theta \mid S) =
\arg \max_{c \in \cC} \max_{\theta \in [0,1]}
p_{c,\theta}(y^m \mid x^m)P(c,\theta)
$$
with ties broken arbitrarily. Integration over $\theta \in [0,1]$
is easy compared to summation over $c \in
\cC$, so one sometimes uses a
learning algorithm (SMP) which integrates over
$\theta$ (like full Bayes) but maximizes over $c$ (like MAP):
$$
\csmap =
\arg \max_{c \in \cC} P(c \mid S) =
\arg \max_{c \in \cC}
E_{\theta \sim P(\theta)} p_{c,\theta}(y^m \mid x^m) P(c \mid \theta).
$$
\subsection{The MDL Classification algorithm}
The MDL approach to classification is transplanted from the MDL
approach to density estimation. There is no such thing as a
`definition' of MDL for classification because the
transplant has been performed in various ways by various authors.
Nonetheless, as we discuss in Section~\ref{sec:mdl}, many implementations are essentially equivalent to the
following algorithm \citep{QuinlanR89,Rissanen89,KearnsMNR97,Grunwald98b}:
\begin{equation}
\label{eq:cmdl}
\cmdl =
\arg \min_{c \in \cC} \ \left\{ \
\log \frac{1}{P(c)} + \log \binom{m}{m\hat{e}_S(c)} \ \right\}.
\end{equation}
The quantity minimized has a coding
interpretation: it is the number of bits required to describe the
classifier plus the number of bits required to describe the labels on
$S$ given the classifier and the unlabeled data. We call $-\log
P(c) + \log \binom{m}{m \hat{e}_S(c)}$ the {\em two-part MDL
codelength\/} for encoding data $S$ with classifier $c$.
\section{Main Theorems}
\label{sec:result}
We prove inconsistency for some countable set of classifiers
${\cal C} = \{c_0, c_1, \ldots \}$ which we define later. The
inconsistency is attained for priors with `heavy tails'. Formally, for
Theorem~\ref{thm:bayesinconsistent} (inconsistency of Bayes, MDL, MAP
and SMP), we
require $P(c_k)$ to be such
that for all $k$,
\begin{equation}
\label{eq:priora}
\log \frac{1}{P(c_k)} \leq \log k + o( \log k).
\end{equation}
This condition is satisfied, for example, by Rissanen's (1983) \nocite{Rissanen83} {\em
universal prior for the integers}. Another simple
prior satisfying (\ref{eq:priora}) can be defined as follows:
group the classifiers $c_1, c_2, \ldots$ as $\cC_0 := \{ c_1 \}$,
$\cC_1 := \{ c_2, c_3 \}$, $\cC_2 := \{ c_4, \ldots, c_7 \}$ and so
on, so that $\cC_a$ contains $2^a$ classifiers. Then the prior of any
classifier in $\cC_a$ is defined as $$P(c) = \frac{1}{a(a+1)}
2^{-a}.$$
The sensitivity of our results to the choice of prior is
analyzed further in Section~\ref{sec:technical}. The prior on
$\theta$ can be any distribution $P$ on $[0,1]$ with a
density $w$ that is continuously
differentiable and bounded away from $0$ on $[0,0.5)$, i.e. for some
$\gamma > 0$,
\begin{equation}
\label{eq:priorb}
\text{for all $\theta \in [0,0.5)$}, w(\theta ) > \gamma.
\end{equation}
For example, we may take the uniform distribution on $[0,1]$ with $w(\theta)
\equiv 1$. We can also take the uniform distribution on $[0,0.5)$,
with $w(\theta) \equiv 2$.
For our result concerning the full Bayesian classifier,
Theorem~\ref{thm:bayesinconsistent}, Part (b), we need to make the
further restriction\footnote{Without this restriction, we may put
nonzero prior on distributions $p_{c,\theta}$ with $\theta > 1/2$.
For such distributions, the log likelihood of the data increases
rather than decreases as a linear function of the error that $c$
makes on the data. This implies that with a uniform prior, under our
definition of the Bayes MAP classifier, in some cases the MAP
classifier may be the classifier with the largest, rather than the
smallest empirical error. As pointed out by a referee, for this
reason the term ``Bayes MAP classifier'' may be somewhat of a
misnomer: it does not always coincide with the Bayes act
corresponding to the MAP distribution. If one insists on defining
the Bayes MAP classifier as the Bayes act corresponding to the MAP
distribution, then one can achieve this simply by restricting oneself to priors
satisfying (\ref{eq:restrictedprior}), since all our results still
hold under the restriction (\ref{eq:restrictedprior}).}
%peter2 added the above
\begin{equation}
\label{eq:restrictedprior}
P(\theta \geq 0.5) = 0.
\end{equation}
For ease of comparison
with other results (Section~\ref{sec:bayes}), we shall also allow
discrete priors on $\theta$ that put all their mass on a countable
set, $[0,1] \cap {\mathbb Q}$. For such priors, we require that they satisfy, for all $a \in
\naturals$, all $b \in \{0, 1, \ldots, \lfloor a/2 \rfloor \}$:
\begin{equation}
\label{eq:discreteprior}
P\left(\theta = \frac{b}{a}\right) \geq K_1 a^{-K_2}
\end{equation}
for some fixed constants $K_1, K_2 > 0$. An example of a prior
achieving (\ref{eq:discreteprior}) is $P(\theta = b/ a) =
1/(a(a+1)\lfloor a/2 +1 \rfloor)$.
We assume that the priors $P(\theta)$ on
$[0,1]$ and the prior $P(c)$ on $\cC$ are independent so that
every classifier can have its own error rate. We require each
classifier to have the same prior for $\theta$. Thus, $P(c,\theta) =
P(c) P(\theta \mid c)$ where for every $c$, $P(\theta \mid c)$ is set to
$P(\theta)$. Note that given a sample $S$, the posterior $P(\theta
\mid c, S)$ can depend on $c$. In the theorem, $H(\mu) = \mu \log
\frac{1}{\mu} + (1-\mu) \log \frac{1}{1-\mu}$ stands for the binary
entropy of a coin with bias $\mu$.
%peter2 added
The function $g(\mu)$ appearing in Part (b) of the theorem is defined
as
\begin{equation}
g(\mu) = \mu + \sup \ \{\nu \mid \nu \geq 0 \; ; \; H(\nu) < H(\mu) - 2 \mu
\}.
\end{equation}
%
This function will be analyzed later.
\begin{theorem}
\label{thm:bayesinconsistent}
{\bf (Classification Inconsistency) \ } There exists an input space
${\cal X}$ and a countable set of classifiers ${\cal C}$ such that the
following holds: let $P$ be any prior satisfying (\ref{eq:priora}) and
(\ref{eq:priorb}).
%peter2 TODO CHECK DEFINITION
Then, for all $\mu \in (0,0.5)$,
\begin{description}
\item[(a)] For all $\mu' \in
[\mu,H(\mu)/2)$, there exists a $D$ with $\min_{c\in \cC} \erateD(c) =
\erateD(c_0) =
\mu$ such that, for all large $m$, with $a_m = 3 \exp(-2 \sqrt{m})$,
\begin{eqnarray}
\label{eq:batman}
\Pr_{S \sim D^m} \left( \emap = \mu' \right) & \geq & 1 - a_m
\nonumber \\
\Pr_{S \sim D^m} \left(\esmap = \mu' \right) & \geq & 1 - a_m
\nonumber \\
\Pr_{S \sim D^m} \left(\emdl = \mu' \right) & \geq & 1 - a_m.
\end{eqnarray}
\item[(b)] If the prior $P$ further satisfies (\ref{eq:restrictedprior}), then for all $\mu' \in
[\mu,g(\mu))$, there exists a $D$ with $\min_{c\in \cC} \erateD(c) =
\erateD(c_0) =
\mu$ such that, for all large $m$, with $a_m = (6+o(1)) \exp(-(1- 2 \mu) \sqrt{m})$,
\begin{equation}
\Pr_{S \sim D^m} \left(
\ebayes \geq \mu' \right) \geq 1 - a_m.
\end{equation}
\end{description}
\end{theorem}
%peter2 changed statement of theorem to reflect changed result for
%full Bayes
%peter2 added some text
Since $H(\mu)/2 > \mu$ for all $\mu \in (0,0.5)$
(Figure~\ref{fig:allowed}), inconsistency of $\cmap$, $\csmap$ and
$\cmdl$ can occur for all $\mu \in (0,0.5)$.
%
The theorem states that Bayes is inconsistent for \emph{all large} $m$
on a fixed distribution $D$. This is a significantly more difficult
statement than ``for all (large) $m$, there exists a learning problem
where Bayes is inconsistent''.\footnote{In fact, a metaargument can be
made that \emph{any} nontrivial learning algorithm is `inconsistent'
in this sense for finite $m$.} Differentiation of $0.5 H(\mu) - \mu$
shows that the maximum discrepancy between $\emap$ and $\mu$ is
achieved for $\mu = 1/5$. With this choice of $\mu$, $0.5 H(\mu) - \mu
= 0.1609\ldots$ so that, by choosing $\mu'$ arbitrarily close to
$0.5 H(\mu)$, the discrepancy $\mu' - \mu$ comes arbitrarily close to
$0.1609\ldots$. These findings are summarized in
Figure~\ref{fig:allowed}.
%peter2 added
Concerning $\cbayes$, since $H(\mu) - 2\mu > 0$ for all $\mu \in
(0,0.5)$, $H(0) = 0$ and $H(\nu)$ is monotonically increasing between
$0$ and $0.5$, we have $g(\mu) > \mu$ for all $\mu \in (0, 0.5)$.
Hence, inconsistency can occur for all such $\mu$. Since $H(\nu)$ is
monotonically increasing in $\nu$, the largest value of $\nu$ can be
obtained for the $\mu$ for which $H(\mu) - 2\mu$ is largest. We
already noted that this is maximized for $\mu=0.2$. Then the largest
$\nu$ such that $H(\nu) < H(\mu) - 2 \mu = 2 \cdot 0.1609...$ can be
numerically calculated as $\nu_{\max} = 0.0586...$. Thus, in that case we can get $\erateD(\cbayes)$ arbitrarily close to $\mu + \nu_{\max} = 0.2586...$.\footnote{While we have no formal proof, we strongly suspect that $g(\mu)$
can be replaced by $H(\mu)/2$ in Part 2 of the theorem, so that the
suboptimality for $\cbayes$ would be just as large as for the other
three classifiers. For this reason we did not bother to draw the
function $g(\mu)$ in Figure~\ref{fig:allowed}.}
%peter 2 end of added part
How large can the discrepancy between $\mu = \inf_c \erateD(c)$ and
$\mu' = \erateD(\cbayes)$ be in the large $m$ limit, for general
learning problems? The next theorem, again summarized in
Figure~\ref{fig:allowed}, gives an upper bound which holds for all
learning problems (distributions $D$), namely, $\mu' < H(\mu)$:
\begin{theorem}{\bf \ (Maximal Inconsistency of Bayes)}
\label{thm:bayesbound}
Let $S^i$ be the sequence consisting of the first $i$ examples
$(x_1,y_1), \ldots, (x_i,y_i)$. For all priors $P$ nonzero on a set
of classifiers $\cC$,
for all learning problems $D$ with $0 < \inf_{c \in \cC} \erateD(c) = \mu< 0.5$,
for all $\delta >
0$, for all large $m$,
with $D^m$-probability $\geq 1 - 2 \exp(-2 \sqrt{m})$,
$$
\frac{1}{m} \sum_{i=1}^m \bigl| y_i - \cbayesi(x_i) \bigr| \leq
H(\mu) + \delta.
$$
\end{theorem}
The theorem says that for large $m$, the total number of mistakes when
successively classifying $y_i$ given $x_i$ made by the Bayesian
algorithm based on $S^{i-1}$, divided by $m$, is not
larger than $H(\mu)$. By the law of large numbers, it follows that
for large $m$,
$\erateD(\cbayesi(x_i))$, {\em averaged\/} over all $i$, is no larger than
$H(\mu)$. Thus, it is not ruled out that sporadically, for some
$i$, $\erateD(\cbayesi(x_i)) > H(\mu)$; but this must be `compensated'
for by most other $i$. We did not find a proof that $\erateD(\cbayesi(x_i)) < H(\mu)$
for {\em all\/} large $i$.
\begin{figure}[tb]
\includegraphics[angle=270]{full_allowed.ps}
\caption{\label{fig:allowed} A graph depicting the set of
asymptotically allowed error rates for different classification
algorithms. The $x$axis depicts the optimal classifier's error rate
$\mu$ (also shown as the straight line). The lower curve is just
$0.5 H(\mu)$ and the upper curve is $H(\mu)$.
Theorem~\ref{thm:bayesinconsistent} says that any $(\mu,\mu')$
between the straight line and
the lower curve can be achieved for some learning
problem $D$ and prior $P$. Theorem~\ref{thm:bayesbound} shows that
the Bayesian learner can never have asymptotic
error rate $\mu'$ above the upper curve.}
\end{figure}
\section{Proofs}
\label{sec:proofs}
\commentout{
%peter1 replaced by new, much 'cleaner' lemma; and moved to appendix. It seemed important to directly start with the most interesting and easiest part of the proof, namely 'the learning problem'
In this section we present the proofs of the three theorems.
Theorem~\ref{thm:bayesinconsistent} and~\ref{thm:bayesbound} both make
use of the following lemma:
\begin{lemma}
\label{pro:basicbayes}
There exists $\gamma > 0$ such that
for all classifiers $c$, $\alpha > 0$, $m > 0$, all $S \sim D^m$
satisfying $\alpha + 1/\sqrt{m} < \hat{e}_S(c) \leq 0.5$, all priors
satisfying (\ref{eq:priorb}): \\
\begin{multline}
\label{eq:bayesbound}
%\ \ \ \ \ $\text{\ \ \ \ \ }
\log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))}
\leq \log \frac{1}{P(y^m \mid x^m, c)}
\leq \\
%$ \ \\
%\ \ \ \ \ \ \ \ \ \ $\text{\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ }
\log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))} + \frac{1}{2} \log {m} +
\frac{1}{2} \frac{1}{\alpha (1 \alpha)}
 \log \gamma.
%$ \
\end{multline}
\end{lemma}
\begin{proof}{\bf (sketch)\ }
For the first inequality, note
\begin{multline}
\log \frac{1}{P(y^m \mid x^m, c)} =
\log \frac{1}{\int P(y^m \mid x^m, c,\theta) P(\theta) d\theta}
\geq
\log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))}, \nonumber
\end{multline}
since the likelihood $P(y^m \mid x^m, c, \theta)$ is maximized
at $\theta = \hat{e}_S(c)$.
For the second inequality, note that
$$
%\begin{multline}
%\int_{\theta \in [0,1]}
\int_0^1
P(y^m \mid x^m, c,\theta) P(\theta) d \theta
\geq
%\int_{\theta: \theta  \hat{e}_S(c) \leq 1 /\sqrt{m} }
\int_{\hat{e}_S(c)  1/ \sqrt{m}}^{\hat{e}_S(c) + 1 /\sqrt{m}}
\exp \bigl( \log P(y^m \mid x^m,
c,\theta) + \log P(\theta) \bigr) d \theta.
%\nonumber
%\end{multline}
$$
We obtain (\ref{eq:bayesbound}) by expanding $ \log P(y^m \mid
x^m, c,\theta)$ around the maximum $\theta = \hat{e}_S(c)$ using a
secondorder Taylor approximation. See, \citep{BarronRY98} for further details.
\end{proof}
}
\subsection{Inconsistent Learning Algorithms:
Proof of Theorem~\ref{thm:bayesinconsistent}}
\label{sec:inconsistencyproof}
Below we first define the particular learning
problem that causes inconsistency. We then analyze the performance of the
algorithms on this learning problem.
\subsubsection{The Learning Problem}
\label{sec:learningproblem}
For given $\mu$ and $\mu' \geq \mu$, we construct a learning problem
and a set of classifiers ${\cal C} = \{ c_0, c_1, \ldots \}$ such that
$c_0$ is the `good' classifier with $\erateD(c_0) = \mu$ and $c_1,
c_2, \ldots$ are all `bad' classifiers with $\erateD(c_j) = \mu' \geq
\mu$. $x = x_0 x_1 \ldots \in {\cal X} = \{0,1\}^\infty$ consists of one binary feature per classifier, and the
classifiers simply output the value of their special feature. The
underlying distribution $D$ depends on two parameters $0 < \phard < 1$ and
$\eta \in [0,1/2)$. These are defined in terms of the $\mu$ and $\mu'$ mentioned in the theorem, in a way to be described later.
%peter2 removed in terms of $\mu$ and $\mu'$,
%peter2 added
To construct an example $(x,y)$, we first flip a fair coin to
determine $y$, so $y =1$ with probability $1/2$. We then flip a coin with bias
$\phard$ which determines if this is a ``hard''
example or an ``easy'' example. Based upon these two coin flips, for
$j \geq 1$, each $x_j$ is independently generated according to the following $2$ cases.
\begin{enumerate}
\item For a ``hard'' example, and for each classifier $c_j$
with $j \geq 1$, set $x_j = 1 - y$ with probability $1/2$ and
$x_j = y$ otherwise. Thus, $x_1, x_2, \ldots$ becomes an infinite sequence of realizations of i.i.d. uniform Bernoulli random variables.
\item For an ``easy'' example, we flip a third coin $Z$ with bias
$\eta$. If $Z=0$ (`example ok'), we set, for every $j \geq 1$, $x_j
= y$. If $Z=1$ (`example corrupted'), we set, for all $j \geq 1$, $x_j = 1 - y$.
Note that for an ``easy'' example, all bad classifiers make
the same prediction.
\end{enumerate}
The bad classifiers essentially predict $Y$ by random guessing on hard
examples. On easy examples, they all make the same prediction, which
is correct with probability $1 - \eta > 0.5$. It remains to define the
input $x_0$ of the ``good'' classifier $c_0$ with true error rate
$\mu$. This is done simply by setting $x_0 = 1 - y$ with probability
$\mu$ and $x_0 = y$ otherwise.
The setting of $\phard$ and $\eta$ is different for, on the one hand,
the (S)MAP and MDL inconsistency proofs, and on the other hand, the
full Bayes inconsistency proof. To get a
first, intuitive idea of the proof, it is best to ignore, for the time
being, the parameter values for the full Bayes proof.
In the MAP and MDL inconsistency proof, we set $\phard := 2\mu'$ and
$\eta = 0$. In the Bayes proof, we first set $\phard := 2 \mu$. We
then define $\nu := \mu' - \mu$ and we set $\eta := \nu/(1 - 2 \mu)$. By
the conditions of the theorem, we have $0 < H(\nu) < H(\mu) - 2\mu$.
Note that for such $\nu$, $H(\nu) < 1 - 2 \mu$ and therefore $2\nu
\leq 1 - 2\mu$ so that $\eta < 1/2$. As is easily checked in the
(S)MAP and MDL case, and somewhat harder to check in the full Bayes
case, the error rate of each `bad' classifier is $\erateD(c_j) = \mu'$
for all $j \geq 1$.
{\em Discussion\/} The inputs $x$ are infinitedimensional vectors.
Nevertheless, the Bayesian posterior can be arbitrarily well
approximated in finite time for any finite sample size \( m \) if we
order the features according to the prior of the associated
classifier. We need only consider features which have an associated
prior greater than \( \frac{1}{2^{m}} \) since the minus
log-likelihood of the data is always less than \( m + O(\log m) \)
bits. This follows because by (\ref{eq:priorb}) and
(\ref{eq:discreteprior}), the prior $P(\theta)$ puts sufficient mass
in a neighborhood of $\theta = 0.5$. For such $\theta$, the distribution
$p_{c,\theta}(y \mid x)$ becomes uniform, independently of $c$.
The (constructive) proof of Theorem~\ref{thm:bayesinconsistent}
relies upon this problem, but it's worth
mentioning that other hard learning problems exist and that this hard learning problem can be
viewed in two other ways:
\begin{enumerate}
\item The input space has two bits (the hardness bit and the ``correct
value'' bit) and the classifiers are stochastic. Stochastic
classifiers might be reasonable (for example) if the task is inferring
which of several stock ``experts'' are the best on average. The stock
pickers occasionally make mistakes as modeled by the stochasticity.
\item The input space consists of one real valued feature. Each bit
in the binary expansion of the real number is used by a different
classifier as above.
\end{enumerate}
\subsubsection{Bayes and MDL are inconsistent}
We now prove Theorem~\ref{thm:bayesinconsistent}. Stage 1 and 2
do not depend on the specific choices for $\phard$ and
$\eta$, and are common to the proofs for MAP, SMP, MDL and full Bayes.
In Stage 1 we show that for some function $k(m)$, for every
value of $m$, with probability converging to 1, there exists
some `bad' classifier $c^* = c_j$ with $0 < j \leq k(m)$ that has $0$
empirical error on hard examples, whereas the good classifier has empirical error close to its expected generalization error. Up to sublinear terms, we find that
\begin{equation}
\label{eq:delft}
\log k(m) = {m\phard}.
\end{equation}
The precise expression is given in (\ref{eq:montventoux}).
In Stage 2 we rewrite the log posterior odds ratio between the good
classifier $c_0$ and $c^*$. Up to sublinear terms (see
(\ref{eq:calvados})), this ratio turns out to be equal to
\begin{equation}
\label{eq:leiden}
m H(\erateS(c^*)) - m H(\erateS(c_0)) + m \phard.
\end{equation}
In Stage 3 we combine (\ref{eq:delft}) and (\ref{eq:leiden}) to show
that, with the choice $\phard = 2 \mu'$, $\eta = 0$, the posterior on
$c^*$ becomes exponentially larger than the posterior on $c_0$, from
which inconsistency of MAP, SMP and MDL readily follows. In Stage 4,
we show that with the choice $\phard = 2 \mu$, $\eta = \nu/(12 \mu)$, the
posterior on $c^*$ still becomes exponentially larger than the
posterior on $c_0$, but now additionally, the classification
performance of the Bayesian classifier (a mixture that puts nearly all
its weight on bad classifiers), cannot exceed that of $c^*$.
\paragraph{Stage 1}
Let $\mhard$ denote the number of hard examples generated within a
sample $S$ of size $m$.
%peter2 added
Let $\erateShard(c)$ be the number of mistakes that the classifier $c$
makes on the subset $\Shard$
of $S$ of hard examples, divided by $\mhard = |\Shard|$.
%
Let $k$ be a positive integer and $\cC_k =
\{c_j \in \cC: 1 \leq j \leq k \}$. For all $\epsilon > 0$
and $m \geq 0$, we have:
\begin{eqnarray}
\lefteqn{\Pr_{S \sim D^m}(\forall c \in \cC_k: \ \erateShard(c) > 0)} & &
\nonumber \\
& \overset{(a)}{=} &
\Pr_{S \sim D^m} \left( \forall c \in \cC_k: \ \erateShard(c) > 0
\mid \frac{\mhard}{m} > \phard + \epsilon \right)
\Pr_{S\sim D^m} \left( \frac{\mhard}{m} > \phard + \epsilon \right)
\nonumber \\
& + & \Pr_{S \sim D^m} \left( \forall c \in \cC_k: \ \erateShard(c) > 0
\mid \frac{\mhard}{m} \leq \phard + \epsilon \right)
\Pr_{S \sim D^m} \left( \frac{\mhard}{m} \leq \phard + \epsilon
\right)
\nonumber \\
& \overset{(b)}{\leq} & e^{-2m \epsilon^2} +
\Pr_{S \sim D^m}\left( \forall c \in \cC_k: \ \erateShard(c) > 0 \mid
\frac{\mhard}{m} \leq \phard + \epsilon \right)
\nonumber \\
\label{eq:monday}
& \overset{(c)}{\leq} & e^{-2m \epsilon^2} +
(1 - 2^{-m \left( \phard + \epsilon \right)})^k
%\nonumber \\
%&
\overset{(d)}{\leq}
%&
e^{-2m \epsilon^2} + e^{-k 2^{-m\left( \phard +
\epsilon \right)}}.
\end{eqnarray}
Here (a) follows because $P(a)= \sum_b P(ab)P(b)$.
(b) follows by $\forall a,P:\,\,P(a) \leq 1$ and the Chernoff bound.
(c) holds from independence and
since $(1 - 2^{-m(\phard + \epsilon)})^k$ is monotonic in
$\epsilon$, and (d) by
$\forall x \in [0,1], k>0:\,\,\, (1-x)^k \leq e^{-kx}$.
We now set $\epsilon_m := m^{-0.25}$ and
\begin{equation}
\label{eq:montventoux}
k =
k(m) = \frac{2m \epsilon_m^2}{2^{-m\left( \phard + \epsilon_m \right)}}.
\end{equation}
Note that, up to sublinear terms, this is equal to (\ref{eq:delft}).
With (\ref{eq:montventoux}),
(\ref{eq:monday}) becomes
\begin{equation}
\label{eq:friday}
\Pr_{S \sim D^m}(\forall c \in \cC_{k(m)}:
\ \erateShard(c) > 0) \leq 2 e^{-2\sqrt{m}}
\end{equation}
On the other hand, by the Chernoff bound we have
%\begin{eqruation}
%\label{eq:sprite}
$
\Pr_{S \sim D^m}(\erateS(c_0) < \erateD(c_0) - \epsilon_m) \leq
e^{-2 \sqrt{m}}
$
%\end{eqruation}
for the optimal classifier $c_0$.
Combining this
with (\ref{eq:friday}) using the union bound, we get that, with
$D^m$-probability larger than $1 - 3 e^{-2\sqrt{m}}$, the following
event holds:
\begin{equation}
\label{eq:saturday}
\exists c \in \cC_{k(m)}:
\ \erateShard(c) = 0 \text{\ and \ } \erateS(c_0) \geq \erateD(c_0) -
\epsilon_m.
\end{equation}
\paragraph{Stage 2}
In this stage, we calculate, for large $m$, the log
ratio between the posterior on some $c^* \in \cC_{k(m)}$ with
$\erateShard(c^*) = 0$ and the posterior on $c_0$. We have:
\begin{multline}
\label{eq:odds}
\log \frac{\max_{\theta}
P(c_0, \theta \mid x^m,y^m)}{\max_{\theta} P(c^*, \theta \mid x^m,y^m)} =
\log
\frac{\max_{\theta} P(c_0) P(\theta) P(y^m \mid x^m, c_0, \theta)}
{\max_{\theta} P(c^*) P(\theta) P(y^m \mid x^m, c^*, \theta)}
= \\
\log \max_{\theta} P(c_0) P(\theta) P(y^m \mid x^m, c_0, \theta)
-
\log \max_{\theta} P(c^*) P(\theta) P(y^m \mid x^m, c^*, \theta).
\end{multline}
Using (\ref{eq:safe}), (\ref{eq:priorb}) and (\ref{eq:discreteprior}),
we see that, uniformly for all samples $S$ with $\erateS(c_0) < 1/2$,
the leftmost term is no larger than
\begin{multline}
\label{eq:reftie}
\log \ \bigl( \max_{\theta} P(c_0) P(\theta) \bigr) \cdot
\bigl(\max_{\theta'} P(y^m \mid x^m, c_0, {\theta'}) \bigr)
= - m H(\erateS(c_0)) + O(1).
\end{multline}
Similarly, uniformly for all samples $S$ with $\erateShard(c^*) = 0,
\erateS(c^*) < 1/2$,
the rightmost term in (\ref{eq:odds}) satisfies
\begin{equation}
\label{eq:rightie}
-
\log \max_{\theta} P(c^*) P(\theta) P(y^m \mid x^m, c^*, \theta) \leq
- \log P(c^*) + m H(\erateS(c^*)) + O(1),
\end{equation}
where the constant in the $O$notation does not depend on $c^*$.
%TODO WHY DOES MARCUS ASK?
Using
condition (\ref{eq:priora}) on prior $P(c^*)$ and using
$c^* \in \cC_{k(m)}$, we find:
\begin{equation}
\label{eq:priorc}
- \log P(c^*) = \log \frac{1}{P(c^*)} \leq \log k(m) + o ( \log k(m)),
\end{equation}
where $\log k(m) = \log (2\sqrt{m}) +
m \phard + m^{0.75}$, so that
\begin{equation}
\label{eq:key}
\log
\frac{1}{P(c^*)} \leq m \phard + o(m)
\end{equation}
which implies that (\ref{eq:rightie}) is no larger than $m \phard +
m H(\erateS(c^*))+
o(m)$. Thus, for
all large $m$, the difference between the leftmost term
and the rightmost term in
(\ref{eq:odds}) satisfies
\begin{equation}
\label{eq:calvados}
\log \frac{\max_{\theta}
P(c_0, \theta \mid x^m,y^m)}{\max_{\theta} P(c^*, \theta \mid x^m,y^m)} \leq
- m H(\erateS(c_0)) + m \phard + m H(\erateS(c^*)) + o(m),
\end{equation}
as long as $\erateS(c_0)$ and $\erateS(c^*)$ are both less than $0.5$.
\paragraph{Stage 3(a)} {\bf (MAP)}
Recall that, for the MAP result, we set $\eta := 0$ and $\phard := 2
\mu'$. Let us assume that the large probability
event (\ref{eq:saturday}) holds. This will allow us to replace the two
`empirical entropies' in (\ref{eq:calvados}), which are random
variables, by corresponding ordinary entropies, which are constants.
By (\ref{eq:saturday}), $\erateShard(c^*) = 0$, so that
we have (since $\eta = 0$) that $\erateS(c^*) = 0$ and then also
$H(\erateS(c^*)) = 0$. Because $H(\mu)$ is continuously differentiable
in a small enough neighborhood around $\mu$, by (\ref{eq:saturday}) we
also have, for some constant $K$,
$$
H(\erateS(c_0)) \geq H(\erateD(c_0)) - K \epsilon_m + O(1)
= H(\mu) + O(m^{-1/2}).$$
Plugging these expressions for $H(\erateS(c^*))$ and $H(\erateS(c_0))$
into (\ref{eq:calvados}), and using the fact that we set $\phard = 2
\mu'$, we see that, as long as $\mu' < H(\mu)/2$, there exists a $c >
0$ such that for all large $m$, (\ref{eq:calvados}), and hence
(\ref{eq:odds}), is smaller than $-cm$.
Thus, (\ref{eq:odds}) is less than $0$ for large $m$, implying that
then $\emap = \mu'$.
We
derived all this from (\ref{eq:saturday}) which holds with
probability $\geq 1 - 3 \exp(-2 \sqrt{m})$. Thus,
for all large $m$,
$ \Pr_{S \sim D^m} \left( \emap = \mu' \right) \geq 1 -
3 \exp(-2 \sqrt{m}),$
and the result follows.
\paragraph{Stage 3(b)} {\bf (SMP)}
We are now interested in evaluating, instead of the posterior ratio (\ref{eq:odds}), the posterior ratio with the error rate
parameters integrated out:
\begin{equation}
\label{eq:smapodds}
\log \frac{
P(c_0 \mid x^m,y^m)}{P(c^* \mid x^m,y^m)} =
\log P(c_0) P(y^m \mid x^m, c_0)
-
\log P(c^*) P(y^m \mid x^m, c^*).
\end{equation}
By Proposition~\ref{prop:uniboem} in the appendix, we see that, if
(\ref{eq:saturday}) holds, then (\ref{eq:odds}) is no larger than
(\ref{eq:smapodds}) plus an additional term of order $O(\log m)$. To see this, apply the first inequality of (\ref{eq:bayesbound})
to the term involving $c_0$, and the second inequality of
(\ref{eq:bayesbound}) to the term involving $c^*$. The result now
follows by exactly the same reasoning as in Stage 3(a).
%We already showed in
%Stage 3(a) that, (b) if (\ref{eq:saturday}) holds, then, as long as $\mu' <
%H(\mu)/2$, there exists a $c > 0$ such that for all large $c$,
%(\ref{eq:odds}) is smaller than $ cm$. Since (\ref{eq:saturday}) holds with
%probability $\geq 1  3 \exp( 2 \sqrt{m})$, the result follows.
\paragraph{Stage 3(c)} {\bf (MDL)}
By part (1) of Proposition~\ref{prop:uniboem} in the appendix, the MDL
procedure is equal to SMP with the uniform prior $w(\theta) \equiv 1$.
Thus, the MDL case is a special case of the SMP case for which we
already proved inconsistency above.
\paragraph{Stage 3(d)} {\bf (Bayes)}
%peter2 rewrote completely
In order to prove the inconsistency for the full Bayesian classifier,
we construct a setup where on hard examples, all classifiers, even
the `good' classifier $c_0$, predict $c(X)=1$ with probability 1/2,
independently of the true value of $Y$. To this end, we refine our
learning problem by setting $x_0 = 1 - y$ with probability 1/2 for a
``hard'' example, and $x_0 = y$ with probability 1 for an ``easy''
example. By setting $\phard := 2\mu$, we still get that $\erateD(c_0)
= \mu$. In order to make the error rate for the bad classifiers $c_1,
c_2, \ldots$ still larger than for $c_0$, we now set $\eta$ to a value
strictly larger than $0$.
We let $\erateSeasy(c)$ denote the empirical error that classifier $c$
achieves on the easy examples in $S$, i.e. the number of mistakes on
the easy examples in $S$ divided by $|S| - |\Shard|$. Now set
$\epsilon_m$ as in Stage 1 and define the events (sets of samples
$S^m$ of length $m$) ${\cal A}$ and ${\cal B}$ as
%\begin{eqarray}
$${\cal A} = \{S^m \; : \; \exists j > 0 :
| \erateSeasy(c_j) - \eta | > \epsilon_m \}
%\nonumber \\
\ ; \
{\cal B} = \{ S^m \; : \; \frac{\mhard}{m} \geq \phard + \epsilon_m \},
$$
%\nonumber
%\end{eqnarray}
and let ${\cal A}^c$ and ${\cal B}^c$ be their respective complements.
We have
\begin{multline}
\Pr_{S \sim D^m}\left( {\cal A} \cup {\cal B} \right) = \\
\Pr \left( {\cal A} \cup {\cal B} \mid
{\cal B} \right) \Pr\left({\cal B} \right) +
\Pr \left( {\cal A} \cup {\cal B} \mid
{\cal B}^c \right) \Pr\left({\cal B}^c \right)
\leq \Pr\left({\cal B} \right) +
\Pr \left( {\cal A} \mid
{\cal B}^c \right) \\ \leq
e^{-2 \sqrt{m}} + 2 e^{-2 m (1 - \phard - \epsilon_m) \epsilon_m^2} =
e^{-2 \sqrt{m}} + 2 e^{-2 \sqrt{m}(1 - \phard) + 2 m^{0.25}}
\leq \\
3 e^{-(1 - \phard) \sqrt{m}} (1+ o(1)),
\end{multline}
where the second inequality follows by applying the Chernoff bound to
both terms (recall that on easy examples, all classifiers $c_j$ with $j > 1$ output the same prediction for $Y$). Combining this with the result of Stage 1 using the union
bound and using $\phard = 2 \mu$, we get that, with $D^m$-probability
at least $1 - 6 (1+ o(1))e^{-\sqrt{m}(1 - 2 \mu)}$, (\ref{eq:saturday})
and ${\cal A}^c$ and ${\cal B}^c$ all hold at the same time. Let us
assume for now that this large probability
event holds. We must then have that some $c^* \in {\cal C}_{k(m)}$ achieves empirical error $0$
on hard examples (which occur with probability $2 \mu$) and at least
$\eta + O(m^{-1/2})$ on easy examples (which occur with probability $1 - 2
\mu$), so that
\begin{equation}
\label{eq:nyons}
\erateS(c^*) = (1 - 2\mu) \eta + O(m^{-1/2}) = \nu + O(m^{-1/2}),
\end{equation}
where $\nu =
\mu' - \mu$. By continuity of $H$, we also have that $H(\erateS(c^*))
= H(\nu) + O(m^{-1/2})$.
Entirely analogously to the reasoning in Stage 3(a), we can now replace the
empirical entropies in the expression (\ref{eq:calvados}) for the
loglikelihood ratio between $c^*$ and $c_0$ by the corresponding
ordinary entropies. This gives
\begin{equation}
\label{eq:calvadosb}
\log \frac{\max_{\theta}
P(c_0, \theta \mid x^m,y^m)}{\max_{\theta} P(c^*, \theta \mid x^m,y^m)} \leq
- m H(\mu) + 2 m \mu + m H(\nu) + o(m),
\end{equation}
By definition of $\nu$, $\nu = \mu' - \mu$ satisfies $-H(\mu) + H(\nu) + 2\mu <
0$, so that there exists a $c > 0$ such that for all large $m$,
(\ref{eq:calvadosb}) is smaller than $-c m$.
Reasoning entirely analogously to Stage 3(b), we see that
(\ref{eq:calvadosb}) still holds if we integrate out $\theta$, rather than
maximize over it: there exists a $c > 0$ such that for all
large $m$,
\begin{equation}
\label{eq:calvadosc}
\log \frac{
P(c_0 \mid x^m,y^m)}{P(c^* \mid x^m,y^m)} \leq
- m H(\mu) + 2 m \mu + m H(\nu) + o(m) \leq -cm.
\end{equation}
Furthermore, by (\ref{eq:nyons}) and our condition on the prior,
the posterior on $\theta$ given $c^*$ must concentrate on $\nu$ (even
though $c^*$ varies with $m$):
we must have that, for every open set $A$ containing
$\nu$, the posterior distribution of $\theta$ given $c^*$ and sample
$S$ satisfies
\begin{equation}
\label{eq:calvadosd}
P(\theta \in A \mid c^*, S) \overset{m \rightarrow
\infty}{\rightarrow} 1.
\end{equation}
We now show that (\ref{eq:calvadosc}) and (\ref{eq:calvadosd}), both
of which hold with high probability, imply that the full Bayesian
classifier based on sample $S$ errs with probability at least
$\mu + \nu = \mu'$:
\begin{multline}
\label{eq:wagen}
\Pr_{X,Y \sim D}(Y \neq \cbayes(X)) =
\Pr_{X,Y \sim D}(Y \neq \cbayes(X) \mid \text{Ex. hard}) \phard + \\
\Pr_{X,Y \sim D}(Y \neq \cbayes(X) \mid \text{Ex. easy}) (1 \phard)
\geq \\
\frac{1}{2} 2 \mu + \Pr_{X,Y \sim D}(Y \neq \cbayes(X) \mid
\text{Ex. easy}) (1  2 \mu) \geq \\
\mu + \Pr_{X,Y \sim D}(Y \neq \cbayes(X) \mid
\text{Ex. easy, corrupted}) (1 2\mu) \eta = \\
\mu + \Pr_{X,Y \sim D}(Y \neq \cbayes(X) \mid
\text{Ex. easy, corrupted}, Y=1) (1 2\mu) \eta.
\end{multline}
Here the first inequality follows by symmetry: on hard examples, $Y=1$
with probability $1/2$ and all classifiers independently output $Y=1$
with probability $1/2$. The final equality follows again by symmetry
between the case $Y=1$ and $Y=0$. Depending on the sample $S$, the probability in the final line
of (\ref{eq:wagen}) is either equal to $1$ or to $0$. It is equal to 1
if $\cbayes(X) = 0$. By (\ref{eq:bayesact}), a sufficient condition
for this to happen is if $S$ is such that
\begin{equation}
\label{eq:maker}
E_{{c,\theta} \sim P(\cdot \mid S)} [ p_{c,\theta}(Y =
1 \mid X=x) ] <
\frac{1}{2}.
\end{equation}
This expectation can be rewritten as
\begin{multline}
\label{eq:huiler}
\sum_{c \in \cC} \int_{\theta \in [0,0.5)} P(\theta \mid c, S)
P( c \mid S) p_{c,\theta} (Y =1 \mid X=x) d \theta = \\
\sum_{c \in \cC}P( c \mid S) u(c,S,x) = \\
P(c_0 \mid S) u(c_0 ,S,x) + P(c^* \mid S) u(c^* ,S,x)
+ \sum_{c \not \in \{c_0,c^*\} } P( c \mid S) u(c,S,x),
\end{multline}
where
$
u(c,S,x) :=
\int_{\theta \in [0,0.5)}
P(\theta \mid c, S)
p_{c,\theta} (Y = 1 \mid X=x) d \theta.
$
Note that we integrate here over $[0,0.5)$, reflecting the extra
condition (\ref{eq:restrictedprior}) that we required for the full
Bayesian result. Since the example in the final line (\ref{eq:wagen}) is corrupted,
for the $x$ occurring there we have that $c_j(x) = 0$ for $j \geq 1$,
so that $p_{c_j,\theta}(Y=1 \mid X = x) < 1/2$ for all $\theta <
1/2$. It follows that for this $x$, (\ref{eq:huiler}) is no greater than
$$
P(c_0 \mid S) + P(c^* \mid S) u(c^* ,S,x) + \sum_{c \not \in
\{c_0,c^*\} } P( c \mid S) \frac{1}{2}.
$$
By (\ref{eq:calvadosc}) and (\ref{eq:calvadosd}), for all $\delta >
0$, for all large $m$ this is no greater
than
\begin{equation}
\label{eq:isere}
a(e^{-c m} \cdot 1 + (1 - e^{-cm}) (\nu + \delta)) + (1-a) \frac{1}{2},
\end{equation}
for some $a$ that may depend on $m$ but that satisfies $0 < a < 1$ for all $m$. Therefore, and since $\nu < 1/2$, (\ref{eq:isere}) is less than $1/2$ for all large $m$. But this implies (by the reasoning above
(\ref{eq:maker})) that $\cbayes(x) =0$. It follows by (\ref{eq:wagen})
that, for large $m$,
$$
\Pr_{X,Y \sim D}(Y \neq \cbayes(X)) \geq \mu + (1 - 2 \mu) \eta =
\mu + \nu = \mu'.
$$
All this is implied under an event that holds with probability at
least $1 - 6 (1+ o(1))e^{-\sqrt{m}(1 - 2 \mu)}$ (see above), so that
the result follows.
\subsection{A Consistent Algorithm: Proof of Theorem~\ref{thm:consistent}}
\label{sec:orb}
In order to prove the theorem, we first state
the Occam's Razor Bound classification algorithm, based on minimizing the bound given by the following theorem.
\begin{theorem} (Occam's Razor Bound) {\rm \citep{BlumerEHW87}}
\label{thm:orb}
For all priors $P$ on a countable set of classifiers $\cC$,
for all distributions $D$, with
probability $1-\delta$:
$$ \forall c:\,\,\,\,\erateD(c) \leq \erateS(c) + \sqrt{\frac{\ln \frac{1}{P(c)} + \ln \frac{1}{\delta}}{2m}}.$$
\end{theorem}
The algorithm stated here is in a suboptimal form, which is good
enough for our purposes (see \citep{McAllester99} for more
sophisticated versions):
$$ \corb := \arg \min_{c \in \cC} \ \left\{
\erateS(c) + \sqrt{\frac{\ln
\frac{1}{P(c)} +
\ln {m}}{2m}}\ \right\}.$$
\paragraph{Proof of Theorem~\ref{thm:consistent}}
Set $\delta_m := 1/m$. It is easy to see that
$$
\min_{c \in \cC} \erateD(c) + \sqrt{\frac{\ln
\frac{1}{P(c)} +
\ln {m}}{2m}}
$$
is achieved for at least one $c \in \cC = \{ c_0, c_1, \ldots
\}$. Among all $c_j \in \cC$ achieving the minimum, let
$\tilde{c}_{m}$ be the one with smallest index $j$. By the Chernoff
bound, we have with probability at least $1 -
\delta_m = 1 - 1/m$,
\begin{equation}
\label{eq:chernoccam}
\erateD(\tilde{c}_{m}) \geq \erateS(\tilde{c}_{m}) -
\sqrt{\frac{\ln (1/\delta_m)}{2m}} =
\erateS(\tilde{c}_{m}) -
\sqrt{\frac{\ln m}{2m}},
\end{equation}
whereas by Theorem~\ref{thm:orb}, with probability at least
$1 - \delta_m = 1 - 1/m$,
%\begin{eqruation}
%\label{eq:occhern}
\begin{multline}
\erateD(\corb) \leq \min_{c \in \cC}
\erateS(c) + \sqrt{\frac{ \ln
\frac{1}{P(c)} +
\ln {m}}{2m}} \leq \\ \erateS(\tilde{c}_{m}) +
\sqrt{\frac{ \ln \frac{1}{P(\tilde{c}_{m})} +
\ln {m}}{2m}}. \nonumber
\end{multline}
%\end{eqruation}
Combining this with (\ref{eq:chernoccam}) using the union bound, we find that
$$\erateD(\corb) \leq \erateD(\tilde{c}_{m}) + \sqrt{\frac{ \ln
\frac{1}{P(\tilde{c}_{m})} +
\ln {m}}{2m}} + \sqrt{\frac{\ln m}{2m}},$$
with probability at least $1  2/m$.
The theorem follows upon noting that the righthand side of this
expression converges to $\inf_{c \in \cC} \erateD(c)$ with increasing $m$.
\subsection{Proof of Corollary~\ref{cor:inconsistent}}
%peter1 added a lot here
The corollary relies on Theorem~\ref{thm:consistent} and a slight
generalization of the proof of Theorem \ref{thm:bayesinconsistent}.
For Theorem \ref{thm:consistent} pick $K < 0.05$.
In Theorem \ref{thm:bayesinconsistent} choose $\mu =
1/5$, $\mu' = 1/5 + .15$. Now part 1 follows.
For part 2, consider Theorem \ref{thm:bayesinconsistent} with the
same $\mu$ and $\mu'$. From the theorem
we see that for the learning problem for which
(\ref{eq:batman}) holds, $c_0$ is the optimal
classifier. Denote this learning problem by $D_0$. We define $D_j$ as
the learning problem (distribution) in the proof (see
Section~\ref{sec:learningproblem}), but with the role of $x_0$ and $x_j$
interchanged. As a result, $c_j$ will be the `good' classifier with
error rate $\mu$ and $c_0$ will be one of the bad classifiers with
rate $\mu'$. Then the good classifier and one of the bad
classifiers will have a different prior probability,
but otherwise nothing changes. Since the proof of Theorem
\ref{thm:bayesinconsistent} does not depend on the prior probability
of the good classifier --- it can be as large or small as we like as
long as it is greater than $0$ --- the proof goes through unchanged, and for all
learning problems $D_j$, (\ref{eq:batman}) will hold.
We now generate a learning problem $D_j$ by first sampling a
classifier $c_j$ according to $P(c)$, and then generating data
according to $D_j$. Then, no matter what $c_j$ we chose,
it will be the optimal (`good')
classifier, and, as we just showed, (\ref{eq:batman}) will
hold. Theorem~\ref{thm:consistent} (with $K < 0.05$) can now be
applied with $D = D_j$, and the result follows.
\subsection{Proof of Theorem~\ref{thm:bayesbound}}
%\enlargethispage{\baselineskip}
Without loss of generality assume that $c_0$ achieves $\min_{c \in \cC} \erateD(c)$. Consider both the $0/1$-loss and the log loss of sequentially
predicting with the Bayes predictive distribution $P(Y_i = \cdot
\mid X_i = \cdot , S^{i-1})$ given by
$P(y_i \mid x_i, S^{i-1}) =
E_{c,\theta \sim P(\cdot \mid S^{i-1})} \, p_{c,\theta}(y_i \mid x_i).$
Every time $i \in
\{1, \ldots, m \}$ that the Bayes classifier based on $S^{i-1}$
classifies $y_i$ incorrectly,
$P(y_i \mid x_i, S^{i-1})$ must be $\leq 1/2$ so that
$- \log P(y_i \mid x_i, S^{i-1}) \geq 1$.
Therefore, if for some $\alpha > 0$, $\erateS(c_0) < 0.5 - \alpha$, then
\begin{equation}
\label{eq:champagne}
\sum_{i=1}^m - \log P(y_i \mid x_i, S^{i-1}) \geq
\sum_{i=1}^m | y_i - \cbayesi(x_i) |.
\end{equation}
On the other hand we have
\begin{multline}
\label{eq:coffee}
\sum_{i=1}^m - \log P(y_i \mid x_i, S^{i-1}) =
- \log \prod_{i=1}^m P(y_i \mid x_i, x^{i-1},y^{i-1}) = \\
- \log \prod_{i=1}^m P(y_i \mid x^m, y^{i-1}) =
- \log \prod_{i=1}^m \frac{P(y^i \mid x^m)}{P(y^{i-1} \mid x^{m})} {=}
- \log P(y^m \mid x^m) = \\
- \log \sum_{j = 0,1,2 \ldots} P(y^m \mid x^m, c_j)
P(c_j) \overset{(a)}{\leq} \\
- \log P(y^m \mid x^m, c_0) - \log P(c_0) \overset{(b)}{\leq}
m H(\erateS(c_0)) + O(\log m),
\end{multline}
where the constant in the $O(\log m)$ term may depend on $\alpha$.
Here inequality (a) follows because a sum is larger than each of its
terms, and (b) follows by Proposition~\ref{prop:uniboem} in the appendix. By the Chernoff bound, for all small enough $\epsilon > 0$,
with probability larger than $1 - 2 \exp(-2 m \epsilon^2)$, we have $
|\erateS(c_0) - \erateD(c_0)| < \epsilon$. We now set $\epsilon_m =
m^{-0.25}$. Using the fact that $H(\mu)$ in (\ref{eq:coffee}) is continuously
differentiable in a neighborhood of $\mu$ and $\mu < 1/2$,
it follows that with probability larger than $1 - 2 \exp(-2 \sqrt{m})$,
for all large $m$,
\begin{equation}
\label{eq:tea}
\sum_{i=1}^m - \log P(y_i \mid x_i, S^{i-1}) \leq
m H(\erateD(c_0)) + Km^{0.75} + O(\log m),
\end{equation}
where $K$ is a constant not depending on $m$.
Combining (\ref{eq:tea}) with (\ref{eq:champagne})
we find that with probability $\geq 1 - 2 \exp(-2 \sqrt{m})$,
$
\sum_{i=1}^m | y_i - \cbayesi(x_i) | \leq
m H(\erateD(c_0)) + o(m)$, which is what we had to prove.
\section{Technical Discussion}
\subsection{Variations of Theorem~\ref{thm:bayesinconsistent} and
dependency on the prior}
\label{sec:technical}
%\enlargethispage{\baselineskip}
\paragraph{Prior on classifiers}
The requirement (\ref{eq:priora}) that
$\log P(c_k) \geq - \log k + o ( \log k)$
is needed to obtain (\ref{eq:key}), which is the key inequality in the proof of
Theorem~\ref{thm:bayesinconsistent}. If $P(c_k)$ decreases at polynomial rate,
but at a degree $d$ larger than one, i.e. if
\begin{equation}
\label{eq:dtail}
\log P(c_k) = -d \log k + o ( \log k),
\end{equation}
then a variation of Theorem~\ref{thm:bayesinconsistent} still applies
but the maximum possible discrepancies between $\mu$ and $\mu'$ become
much smaller: essentially, if we require $\mu \leq \mu' < \frac{1}{2d}
H(\mu)$ rather than $\mu \leq \mu' < \frac{1}{2} H(\mu)$ as in
Theorem~\ref{thm:bayesinconsistent}, then the argument works for all
priors satisfying (\ref{eq:dtail}). Since the derivative $d H(\mu) /
d\mu \rightarrow \infty$ as $\mu \downarrow 0$, by setting $\mu$ close
enough to $0$ it is possible to obtain inconsistency for any fixed
polynomial degree of decrease $d$. However, the higher $d$, the smaller $\mu =
\inf_{c \in \cC} \erateD(c)$ must be to get any inconsistency with this
argument.
\paragraph{Prior on error rates}
Condition (\ref{eq:priorb}) on the prior on the error rates is
satisfied for most reasonable
priors.
Some approaches to applying MDL to classification problems amount to
assuming priors of the form $p(\theta^*) = 1$ for a single $\theta^*
\in [0,1]$ (Section~\ref{sec:mdl}). In that case, we can still
prove a version of Theorem~\ref{thm:bayesinconsistent}, but the
maximum discrepancy between $\mu$ and $\mu'$ may now be either larger
or smaller than $H(\mu)/2 - \mu$, depending on the choice of
$\theta^*$.
\subsection{Properties of the transformation
from classifiers to distributions}
\label{sec:trafo}
\paragraph{Optimality and Reliability}
Assume that the conditional distribution of $y$ given $x$ according to the
`true' underlying distribution $D$ is defined for all $x \in \cX$, and let $p_D(y \mid x)$ denote its mass function.
Define $\Delta(p_{c,\theta})$ as the Kullback-Leibler (KL) divergence \citep{CoverT91} between
$p_{c,\theta}$ and the `true' conditional distribution $p_D$:
$$
\Delta(p_{c,\theta}) :=
\KL(p_D \| p_{c,\theta}) =
E_{(x,y) \sim D} [ - \log p_{c,\theta}(y \mid x) + \log p_{D}(y \mid x)],
$$
and note that for each fixed $c$,
$\min_{\theta \in
[0,1]} \Delta(p_{c,\theta})$ is uniquely achieved for $\theta =
\erateD(c)$ (this follows by differentiation) and satisfies
\begin{equation}
\label{eq:turnaround}
\min_{\theta} \Delta(p_{c,\theta}) = \Delta(p_{c,\erateD(c)}) =
H(\erateD(c)) - K_D,
\end{equation}
where $K_D = E [- \log p_{D}(y \mid x)]$ does not
depend on $c$ or $\theta$, and $H(\mu)$ is the binary entropy.
\begin{proposition}
\label{pro:trafo}
Let $\cC$ be any set of classifiers, and let $c^*
\in \cC$ achieve $\min_{c \in \cC} \erateD(c) = \erateD(c^*)$.
\begin{enumerate}
\item If $\erateD(c^*) < 1/2$, then
$$\min_{c,\theta} \Delta(p_{c,\theta}) \text{\ is uniquely achieved for
\ } (c,\theta) = (c^*,\erateD(c^*)).$$
\item $\min_{c,\theta} \Delta(p_{c,\theta}) = 0$ iff $p_{c^*,\erateD(c^*)}$ is `true', i.e. if
$\forall x, y: \/ p_{c^*,\erateD(c^*)}(y \mid x) = p_D(y \mid x)$.
\end{enumerate}
\end{proposition}
\begin{proof}
Property 1 follows from (\ref{eq:turnaround}) and the fact that
$H(\mu)$ is monotonically increasing for $\mu < 1/2$. Property 2 follows
directly from the information inequality \citep{CoverT91}, using the
fact that we assume $p_D(y \mid x)$ to be well-defined for all $x$, which
implies that $X$ has a density $p_D(x)$ with $p_D(x) > 0$ for all $x$.
\end{proof}
Proposition~\ref{pro:trafo} implies that the transformation is a good
candidate for turning classifiers into probability distributions.
Namely, let $\cP = \{ p_\alpha : \alpha \in A \}$ be a set of i.i.d.
distributions indexed by parameter set $A$ and let $P(\alpha)$ be a
prior on $A$. By the law of large numbers, for each $\alpha \in A$,
$ - m^{-1} \log p_{\alpha}(y^m \mid x^m)P(\alpha) - K_D \rightarrow \KL(p_D \|
p_{\alpha})$. By Bayes rule, this implies that if the class $\cP$ is
`small' enough so that the law of large numbers holds {\em
uniformly\/} for all $p_{\alpha} \in \cP$, then for all $\epsilon >
0$, the Bayesian posterior will concentrate, with probability 1, on
the set of distributions in $\cP$ within $\epsilon$ of the $p^* \in
\cP$ minimizing KL-divergence to $D$. In this case, if $\cC$ is
`simple' enough so that the corresponding $\cP = \{ p_{c,\theta} : c
\in \cC, \theta \in [0,1] \}$ admits uniform convergence
\citep{Grunwald98b}, then the Bayesian posterior asymptotically
concentrates on the $p_{c^*,\theta^*} \in \cP = \{ p_{c,\theta} \}$
closest to $D$ in KL-divergence. By Proposition~\ref{pro:trafo}, this
$p_{c^*,\theta^*}$ corresponds to the $c^* \in \cC$ with smallest
generalization error rate $\erateD(c^*)$ ($p_{c^*,\theta^*}$ is {\em
optimal\/} for $0/1$loss), and for the $\theta^* \in [0,1]$ with
$\theta^* = \erateD(c^*)$ ($p_{c^*,\theta^*}$ gives a {\em reliable\/}
impression of its prediction quality). This convergence to an optimal
and reliable $p_{c^*,\theta^*}$ will happen if, for example, $\cC$ has
finite VC-dimension \citep{Grunwald98b}. We can only get trouble as in
Theorem~\ref{thm:bayesinconsistent} if we allow $\cC$ to be of
infinite VC-dimension.
\paragraph{Analogy to Regression} In ordinary (real-valued)
regression, ${\cal Y } = {\mathbb R}$,
and one tries to learn a function $f \in {\cal F}$ from the data. Here ${\cal F}$ is
a set of candidate functions
${\cal X} \rightarrow {\cal Y}$. In order to apply Bayesian inference
to this problem, one assumes a probability model $\cP$ expressing
$Y = f(X) + Z$, where
$Z$ is independent noise with mean $0$ and variance $\sigma^2$. $\cP$ then
consists of conditional density functions $p_{f,\sigma^2}$, one for each $f \in {\cal F}$ and $\sigma^2 > 0$. It is
well known that if one assumes $Z$ to be normally distributed
independently of $X$, then the $p_{f,\sigma^2}$ become Gaussian densities and
the log likelihood becomes {\em a linear function of
the mean squared error\/} \citep{Rissanen89}:
\begin{equation}
\label{eq:gauss}
- \ln p_{f,\sigma^2}(y^n \mid x^n) = \beta_{\sigma} \sum_{i=1}^n (y_i - f(x_i))^2 +
n \ln Z(\beta_{\sigma}),
\end{equation}
where we wrote $\beta_{\sigma} = 1/2 \sigma^2$ and
$Z(\beta) = \int_{y \in \cY}
\exp(- \beta y^2)d y$. Because least squares is an intuitive,
mathematically wellbehaved and easy to perform procedure, it is often
assumed in Bayesian regression that the noise is normally distributed
--- even in cases where, in reality, it is not
\citep{Grunwald98b,KleijnvdV04}.
Completely analogously to the Gaussian case, the transformation
used in this paper maps classifiers $c$ and noise rates
$\theta$ to distributions $p_{c,\theta}$ so that the log likelihood
becomes a {\em linear function of the $0/1$error}, since it can be
written as:
\begin{equation}
\label{eq:entrop}
- \ln p_{c,\theta}(y^n \mid x^n) = \beta_{\theta}
\sum_{i=1}^n | y_i - c(x_i)| +
n \ln Z(\beta_{\theta}),
\end{equation}
where we wrote $\beta_\theta = \ln (1- \theta) - \ln \theta$ and $Z(\beta) =
\sum_{y \in \cY} \exp(- \beta y)$ \citep{Grunwald98b,MeirM95}. Indeed, the
models $\{ p_{c,\theta} \}$ are a special case of {\em
logistic regression models}, which we now define:
\paragraph{Logistic regression interpretation}
let $\cC$ be a
set of functions $\cX \rightarrow \cY$, where $\cY \subseteq {\mathbb
R}$ ($\cY$ does not need to be binary-valued). The corresponding
logistic regression model is the set of conditional distributions $\{
p_{c,\beta} : c \in {\cal C} ; \beta \in {\mathbb R} \}$ of the
form
\begin{equation}
\label{eq:logit}
p_{c,\beta}(y \mid x) := \frac{e^{- \beta | y - c(x)|}}{1 + e^{- \beta}}
\end{equation}
This is the standard construction used to convert classifiers with
realvalued output such as support vector machines and neural networks
into conditional distributions \citep{Jordan95,Tipping01}, so that
Bayesian inference can be applied. By setting $\cC$ to be a set of
$\{0,1\}$-valued classifiers, and substituting $\beta = \ln (1-
\theta) - \ln \theta$ as in (\ref{eq:entrop}), we see that the
construction is a special case of the logistic regression transformation
(\ref{eq:logit}). It may seem that (\ref{eq:logit}) does not treat $y
= 1$ and $y= 0$ on equal footing, but this is not so: we can
alternatively define a symmetric version of (\ref{eq:logit}) by
defining, for each $c \in \cC$, a corresponding $c': \cX \rightarrow
\{-1,1\}$, $c'(x) := 2c(x) - 1$. Then we can set
\begin{equation}
\label{eq:logitb}
p_{c,\beta}(1 \mid x) := \frac{e^{\beta c'(x)}}{e^{\beta c'(x)} + e^{- \beta c'(x)}} \ \ ; \ \
p_{c,\beta}(-1 \mid x) := \frac{e^{- \beta c'(x)}}{e^{\beta c'(x)} + e^{- \beta c'(x)}}.
\end{equation}
By setting $\beta = 2 \beta'$ we see that $p_{c,\beta}$ as in
(\ref{eq:logit}) is identical to $p_{c,\beta'}$ as in
(\ref{eq:logitb}), so that the two models really coincide.
\section{Interpretation from a Bayesian perspective}
\label{sec:bayes}
We already addressed several questions concerning the relevance of our result
directly below Corollary~\ref{cor:inconsistent}. Here we provide a more in-depth analysis from a Bayesian point of view.
\subsection{Bayesian Consistency}
\label{sec:bayescons}
It is well-known that Bayesian inference is strongly consistent under very
broad conditions (\citealt{Doob49,BlackwellD62}; see also \citealt{Barron85}). Such Bayesian
consistency results take on a particularly strong form if the set of
distributions under consideration is {\em countable}. In our
setting we can achieve this by adopting a discrete prior satisfying
(\ref{eq:discreteprior}). In that case,
the celebrated \citet{Doob49}
consistency theorem\footnote{In particular, see Eq. (3.6) in \citet{Doob49} combined with the
remark at the end of Section 3 of Doob's paper.} says the following for our setting. Let $\cC$
be countable and suppose
$D$ is such that, for some $c^* \in \cC$ and $\theta^* \in [0,1] \cap
{\mathbb Q}$,
$p_{c^*,\theta^*}$ is equal to $p_D$, the true distribution/ mass function of $y$ given
$x$.
Then with $D$probability
1, the Bayesian posterior concentrates on $c^*$:
$ \lim_{m \rightarrow \infty}
P( c^* \mid S^m) = 1$.
Consider now the learning problem underlying
Theorem~\ref{thm:bayesinconsistent} as described in
Section~\ref{sec:inconsistencyproof}. Since $c_0$ achieves $\min_{c
\in \cC} \erateD(c)$, it follows by part 1 of
Proposition~\ref{pro:trafo} that $\min_{c,\theta} \Delta(p_{c,\theta})
= \Delta(p_{c_0,\erateD(c_0)})$. {\em If\/} $
\Delta(p_{c_0,\erateD(c_0)})$ were $0$, then by part 2 of
Proposition~\ref{pro:trafo}, Doob's theorem would apply, and we
would have $P(c_0 \mid S^m) \rightarrow
1$. Theorem~\ref{thm:bayesinconsistent} states that this does {\em
not\/} happen. It follows that the premise
$\Delta(p_{c_0,\erateD(c_0)}) = 0$ must be false. But since
$\Delta(p_{c,\theta})$ is minimized for $(c_0,\erateD(c_0))$, the
Proposition implies that for {\em no\/} $c \in \cC$ and {\em no\/}
$\theta \in [0,1] \cap {\mathbb Q}$, $p_{c,\theta}$ is equal to $p_D(\cdot \mid \cdot)$ --- in
statistical terms, the model $\cP = \{ p_{c,\theta} : c \in \cC,
\theta \in [0,1] \cap {\mathbb Q} \}$ is {\em misspecified}. Thus, the result can be
interpreted in two ways:
\begin{enumerate}
\item {\em `ordinary' Bayesian inference can be inconsistent under
misspecification\/}: We exhibit a simple logistic regression model
$\cP$ and a true distribution $D$ such that, with probability 1, the
Bayesian posterior does not converge to the distribution
$p_{c_0,\erateD(c_0)} \in \cP$ that minimizes, among all $p \in
\cP$, the KL-divergence to $D$ (equivalently, $p_{c_0,\erateD(c_0)}$
minimizes the $D$expected log loss among all distributions in
$\cP$). Thus, the posterior does not converge to the optimal
$p_{c_0,\erateD(c_0)}$ even though $p_{{c}_0,\erateD(c_0)}$ has
substantial prior mass and is {\em partially\/} correct in the sense
that $c_0$, the Bayes optimal classifier relative to
$p_{{c}_0,\erateD(c_0)}$, has true error rate $\erateD(c_0)$, which
is the {\em same\/} true error rate that it would have if
$p_{{c}_0,\erateD(c_0)}$ were `true'.
\item {\em `pragmatic' Bayesian inference for classification can be
suboptimal\/}: a standard way to turn classifiers into
distributions so as to make application of Bayesian
inference possible may give rise to suboptimal performance.
\end{enumerate}
\subsection{Two types of misspecification}
$p_{c_0,\erateD(c_0)}$ can be misspecified in
{\em two different ways}. To see this, note that
$p_{c_0,\erateD(c_0)}$ expresses that
\begin{equation}
\label{eq:znoise}
y =
c_0(x) \ {\tt xor} \ z,
\end{equation}
where $z$ is a noise bit generated
independently of $x$. This statement may be wrong under distribution $D$ either because
(a) $c_0$ is not the Bayes optimal classifier according to $D$; or
(b) $c_0$ is Bayes optimal, but $z$ is dependent on $x$ under $D$.
Let us consider both of them in more detail.
\begin{description}
\item[(a) no Bayes optimal classifier in $\cC$]
The way we defined the learning problem $D$
used in the proof of Theorem~\ref{thm:bayesinconsistent}
(Section~\ref{sec:inconsistencyproof}) is an example of this case.
This type of misspecification is subtle, because if we consider the
optimal $c_0$ {\em in isolation}, ignoring the features $X_i$ which do
not influence the prediction made by $c_0$, then the conditional
distribution $P(Y=1 \mid c_0(X), \theta^*)$ becomes correct after all,
in the sense that it is identical to the true conditional probability.
That is: for all $x_0 \in \{0,1\}$, we have
$$p_{c_0,\erateD(c_0)}(Y=1 \mid X_0 = x_0) =
\Pr_{X,Y \sim D}(Y=1 \mid
c_0(X) = x_0, X_0 = x_0),$$
so $p_{c_0,\erateD(c_0)}(\cdot \mid X_0 = x_0)$ is `true'. This may
imply that the set of distributions corresponding to $\cC$ is wellspecified,
since $c_0$ only `listens' to feature $X_0$.
Yet still, misspecification occurs because for some $x \in\{0,1\}^\infty$,
$$p_{c_0,\erateD(c_0)}(Y=1 \mid X = x) {\mathbf \neq}
\Pr_{X,Y \sim D}(Y=1 \mid c_0(X) = x_0, X= x).$$
\item[(b) $\cC$ contains Bayes act, but $D$ is heteroskedastic]
It may seem that our theorem is only applicable to misspecification
of type (a). But it is easy to see that it is just as applicable to
the --- arguably less serious --- misspecification of type (b). Namely,
in the proof of Theorem~\ref{thm:bayesinconsistent}
(Section~\ref{sec:inconsistencyproof}), we could have equally used
the following slightly modified learning problem: step 1 and step 2
remain identical, so $c_1, c_2, \ldots$ are defined as before. The
optimal $c_0$ is now defined by modifying step 3 as follows: for an
easy example, we set $x_0 = y$. For a hard example, we set $x_0 =
1 - y$ with probability $\mu/ 2 \mu'$. Then the proof of
Theorem~\ref{thm:bayesinconsistent} holds unchanged. But now $c_0$
{\em is\/} the Bayes optimal classifier relative to $D$, as is easy
to see.
\end{description}
\subsection{Why is the result interesting for a Bayesian?}
Here we answer several objections that a cautious Bayesian might have
to this work.
\renewcommand{\thesubsubsection}{\thesubsection.\arabic{subsubsection}}
\subsubsection{Bayesian inference has never been designed to work
under misspecification. So why is the result relevant?} \
\\
We would maintain that in {\em practice}, Bayesian inference is applied
{\em all the time\/} under misspecification in classification
problems \citep{Grunwald98b}. It is
very hard to avoid misspecification with Bayesian classification,
since the modeler often has no idea about the noise-generating
process. Even though it may be known that noise is not
homoskedastic, it may be practically impossible to incorporate
all ways in which the noise may depend on $x$ into the prior.
\subsubsection{It is already well-known that Bayesian inference
can be inconsistent even if $\cP$ is {\em wellspecified}, i.e. if it
contains $D$ \citep{DiaconisF86,Barron98}. So why is this result interesting?} \
\\ \label{sec:diaconis} The (in)famous inconsistency results by
\citet{DiaconisF86} are based on nonparametric inference with
uncountable sets $\cP$. It follows from \citet{Barron98}
that their theorems require that the true
distribution $D$ has
`extremely small' prior density in the following sense:
the prior mass of
$\epsilon$-Kullback-Leibler balls around $D$ is exponentially
small in $1 / \epsilon$. Since such priors do not allow
one to compress the data, from an MDL perspective it is not at all
surprising that they lead to inconsistent inference \citep{Barron98}.
In contrast, in our result, rather than small prior densities
we require misspecification. Since Diaconis and Freedman do not
require misspecification, in a
sense, our result is weaker. On the other hand, in our setting,
the prior on the $p \in \cP$ closest in KL divergence to the true
conditional distribution $p_D$ can be arbitrarily close to $1$,
whereas Diaconis and Freedman require the prior of the `true' $p_D$ to
be exponentially small in the sense explained above. In this sense,
our result is stronger than theirs.
\cite{Barron98} exhibits an example of Bayesian inconsistency that is
closer in spirit to ours. In his example, the prior {\em density\/}
of KL-neighborhoods of the true $D$ can be substantial. Nevertheless,
his example requires that $\cP$ contains uncountably many
distributions. It is not possible to extend Barron's example to a case with
only countably many distributions, since in that
case, the posterior {\em must\/} concentrate\footnote{More precisely,
the posterior mass on the set of all distributions in $\cP$ that are
mutually singular with $D$ must go to $0$ with $D$probability 1.} on the true $D$ by Doob's result. Our result shows that even in the countable case, as
soon as one allows for slight misspecification, the posterior may not
converge to the best distribution in $\cP$. Indeed, by an appropriate setting of the parameters
$\mu$ and $\mu'$ it is seen from Theorem~\ref{thm:bayesinconsistent}
that for every $\epsilon > 0$, no matter
how small, we can exhibit a $D$ with
$$
\min_{c,\theta} {\rm KL}(p_D \| p_{c,\theta}) = \epsilon
$$
for which Bayes is inconsistent with
$D$probability 1. This is interesting because even under
misspecification, Bayes is consistent under fairly broad conditions
\citep{BunkeM98,KleijnvdV04}, in the sense that the posterior
concentrates on a neighborhood of the distribution that minimizes
KL-divergence to the true $D$. We showed that if such conditions are
violated, then consistency may fail dramatically. Thus, we feel our result is relevant
at least from the {\em inconsistency under misspecification\/}
interpretation.
\subsubsection{So how can the result coexist with theorems establishing
Bayesian consistency under misspecification?} \
\\ \label{sec:consistmis}
Such results are typically proved under either one of the following
two assumptions:
\begin{enumerate}
\item The set of distributions $\cP$ is `simple', for example,
finitedimensional parametric. In such cases, ML estimation is
usually also consistent --- thus, for large $m$ the role of the prior
becomes negligible. In case $\cP$ corresponds to a classification
model $\cC$, this would occur if $\cC$ were finite or had finite VC-dimension for example.
\item $\cP$ may be arbitrarily large or complex, but it is {\em
convex\/}: any finite mixture of elements of $\cP$ is an element
of $\cP$. An example is the family of Gaussian mixtures with an
arbitrary but finite number of components.
%peter3 added
Theorem 5.5 of \citet{Li97} shows that for general convex i.i.d. families (not just
Gaussian mixtures), under conditions on the priors, two-part MDL
(essentially the version of MDL that we consider here) is consistent
in the sense of expected Kullback-Leibler risk.
Although we have no
formal proof, Li's result strongly suggests that with such priors,
the Bayesian MAP and full Bayesian approach will also be consistent.
\end{enumerate}
Our setup violates both conditions: $\cC$ has infinite VC-dimension,
and the corresponding $\cP$ is not closed under taking mixtures. The latter issue is discussed further in Example~\ref{ex:mix}.
\subsubsection{How `standard' is the conversion from classifiers to
probability distributions on which the results are based?}\label{sec:bayestest} \ \\ One may argue that the notion of `converting'
classifiers into probability distributions is not always what
Bayesians do in practice. For classifiers which produce {\em
realvalued} output, such as neural networks and support vector
machines, the transformation coincides with the logistic regression
transformation, which is a standard Bayesian tool; see for example
\citep{Jordan95,Platt99,Tipping01}. But the theorems are based on
classifiers with $0/1$-output. With the exception of decision trees,
such classifiers have not been addressed frequently in the Bayesian
literature. Decision trees have usually been converted to conditional
distributions somewhat differently: one uses the same logistic
transformation as we do, but one assumes a different noise rate {\em
in each leaf\/} of the decision tree \citep{HeckermanCMRK00}; thus,
the transformation is done locally for each leaf rather than globally
for the whole hypothesis. Since the noise rate can depend on the
leaf, the set of all decision trees of arbitrary length
on a given input space $\cX$ coincides with the set of all conditional
distributions on $\cX$. Thus it avoids the misspecification, and
therefore the inconsistency problem, but at the cost of using a much
larger model space.
Thus, here is a potentially weak point in the analysis: we use a
transformation that has mostly been applied to real-valued
classifiers, whereas here the classifiers are $0/1$-valued.
Nevertheless, to get an idea of how reasonable our transformation is,
we simply {\em tested\/} it with three professing Bayesians. We did this in
the following way: we first described the set of classifiers ${\cal
C}$ used in the learning problem, and we said that we would like to
perform Bayesian inference based on some prior over ${\cal C}$. We
then asked the Bayesian how (s)he would handle this problem. All three
Bayesians said that they would construct conditional distributions
according to the logistic transformation, just as we did.
We take this as evidence that the logistic
transformation is reasonable, even for classifiers with binary outputs.
Whether the inconsistency results can be extended in a natural way to
classifiers with real-valued output such as support vector machines
remains to be seen. The fact that the Bayesian model corresponding to
such classifiers will still typically be misspecified strongly
suggests (but does not prove) that similar scenarios may be
constructed.
\subsubsection{Is there an alternative, more sophisticated transformation that avoids inconsistencies?}
Even though the transformation we perform is standard, there may exist
some other method of transforming a set of classifiers$+$prior into a
set of distributions$+$prior that avoids the problems. There are only
two obvious options which suggest themselves:
\begin{description}
\item[1. Avoiding Misspecification] First, we can try to avoid
misspecification; then by the strong Bayesian consistency theorems
referred to in Question~\ref{sec:diaconis}, we should be guaranteed
to converge to the optimal classifier. However, as we explain
below, this is often not practical.
\item[2. Ensuring $\cP$ is Convex] Second, rather than using the set
of transformed classifiers $\cP$, we could put a prior on its convex
closure $\overline{\cP}$ (this is the set of all finite and infinite
mixture distributions that can be formed from elements of $\cP$.
Note in particular that $\cP$ and $\overline{\cP}$ are sets of
distributions defined on one outcome, not on a sample of $m > 1$
outcomes). Then, we can once again apply the consistency theorem
for convex $\cP$ referred to in Question~\ref{sec:consistmis}, and
we should be guaranteed to converge to the optimal distribution.
Computational difficulties aside, this approach will not work,
because now the distribution we converge to may not be the
distribution we are interested in, as we describe further below.
\end{description}
Thus, the only two straightforward solutions to the transformation
problem are either impractical or do not work. We discuss both of
these in detail below. There may of course exist some clever
alternative method that avoids all problems, but we have no idea what
it would look like.
\paragraph{1. Can we ensure consistency by avoiding misspecification?}
From a subjective Bayesian perspective, one might require the learning
agent to think hard enough about his or her prior probabilities so
that the set of conditional distribution $\cP$ does contain $D$, the
true state of nature. In practice this means that one should ensure
that $\cC$ contains the Bayes optimal classifier with respect to $D$,
and that $\cP$ should contain distributions in which the noise $z$
(Equation (\ref{eq:znoise})) can depend on the feature value $x$. In
practical machine learning applications one will often have no idea
how the Bayes optimal classifier behaves or how the noise
depends on $x$. Thus, the only way to proceed seems to design a prior
on {\em all\/} possible classifiers and {\em all\/} possible noise
rate functions. Now the inconsistency problem is solved, because the
(`nonparametric') model thus constructed is guaranteed to contain the
true (conditionalized) distribution $D$, so common Bayesian
consistency theorems (see above) apply. However, the cost may be
enormous: the model space is now {\em much\/} larger and it seems that a
lot more data may be needed before a reasonable approximation of $D$
is learned --- although interestingly, recent work by M.
\cite{Hutter05} suggests that under suitable priors, reasonable approximations
may be learned quite fast. It is not clear whether or not something
like this can be done in our context.
% (Nevertheless, such a
%`nonparametric Bayes' approach is sometimes successfully applied in
%practice  it can be argued that Gaussian process models for
%classification \cite{} try to achieve just this).
\paragraph{2. Can we ensure consistency by using convex models?}
Suppose we first use the logistic transformation to transform the
classifiers $\cC$ into a set of conditional distributions $\cP$, and we then
put a prior on its convex
closure $\overline{\cP}$ and use Bayesian inference based on
$\overline{\cP}$. Now, Li's result (Section~\ref{sec:consistmis})
suggests that the Bayesian posterior predictive distribution is
(under weak conditions on the prior) guaranteed to converge to the closest distribution $p^*$ to $D$ within $\overline{\cP}$, as measured in KL-divergence. However,
as the following example shows, $p^*$ may end up having {\em larger\/}
generalization error (expected $0/1$loss) than the optimal classifier
$c^*$ in the set $\cC$ on which $\cP$ was based. Thus, existing
theorems suggest that
with a prior on $\overline{\cP}$, the Bayesian posterior will
{\em converge}, but below we show that if it does converge, then it
will sometimes converge to a distribution that is
suboptimal in the performance measure we are interested in.
\begin{example}[classification error and taking mixtures]
\label{ex:mix}
\rm %peter1 this example shows that there is NO EASY FIX
% to making the transformation
% between classifiers and distributions work.
% we can perhaps leave out some of the details to shorten it; but i would really
% like to keep it in. Also because we should have some extra meat
% in the journal paper as compared to the conference paper!
We consider the following learning problem. There are three
classifiers $\cC = \{c_1, c_2, c_3\}$ and three features $X_1, X_2,
X_3$ taking values in $\{0,1\}$. Each classifier simply outputs the
value of the corresponding feature. The underlying distribution $D$
is constructed as follows. We distinguish between three `situations'
$s_1, s_2, s_3$ (these are the values of some random variable $S'
\sim D$ that is not observed). To construct an example $(x,y)$, we
first flip a fair coin to determine $y$, so $y =1$ with probability
$1/2$. We then flip a fair three-sided coin to determine what
situation we are in, so $S'= s_j$ with probability $1/3$, for $j \in
\{1,2,3\}$. Now if we happen to be in situation $s_j$, we
\begin{enumerate}
\item Set $x_j = y$ (so $c_j$ will predict $Y$ correctly).
\item Flip a fair coin, determine the outcome $z \in \{0,1\}$, and set $x_{j'} = z$ for the two values of $j' \in \{1,2,3\}$ that are not equal to $j$.
\end{enumerate}
Thus, the value of $x_{j'}$ is determined completely at random, but must be the same for both features not equal to $j$. We thus have for $j = 1,2,3$:
\begin{eqnarray}
\label{eq:snowy}
\erateD(c_j) & = &
\frac{1}{3} \cdot 0 + \frac{2}{3} \cdot \frac{1}{2} = \frac{1}{3}, \\
\label{eq:boxz}
\KL(p_D \| p_{c_j,\erateD(c_j)}) & = &
E_{(x,y) \sim D} [ - \log p_{c_j,\erateD(c_j)}(y \mid x) + \log p_{D}(y \mid x)] =
\nonumber \\ & &
H(\erateD(c_j)) - K_D =
H(\frac{1}{3}) - K_D
%= \log 3 - \frac{2}{3} - K_D
> .9 - K_D.
\end{eqnarray}
Equation (\ref{eq:boxz}) follows by (\ref{eq:turnaround}), and as in that equation, $H$ is the binary entropy as defined above
Theorem~\ref{thm:bayesinconsistent}, and $K_D$ is the conditional
entropy of $y$ given $x$ according to $D$, which does not depend on $j$.
Thus, the distribution(s) in $\cP := \{ p_{c_j,\theta} \mid j \in
\{1,2,3\}, \theta \in [0,1 ] \}$ closest to the underlying $D$ in
KL-divergence have KL divergence $H(\frac{1}{3}) - K_D$ to $D$.
Now consider the set of conditional distributions $\overline{\cP}$
defined as the convex closure of $\cP$. It is easy to see that each element of
$\overline{\cP}$ can be written as a three-component mixture
$$
\overline{p}_{\vec{\alpha},\vec{\theta}} := \alpha_1 p_{c_1,\theta_1} + \alpha_2 p_{c_2,\theta_2} + \alpha_3 p_{c_3,\theta_3},
$$
for some vector $\vec{\alpha} = (\alpha_1, \alpha_2,\alpha_3)$ with
nonnegative entries summing to $1$, and $(\theta_1,
\theta_2,\theta_3)$ with nonnegative entries $\leq 1$. Thus, the
distribution in $\overline{\cP}$ that is closest to $D$ in KL divergence
is the distribution
%$\overline{p}^* :=
%\overline{p}_{\vec{\alpha},\vec{\theta}}$
that achieves the minimum
over $\vec{\alpha}$ and $\vec{\theta}$ of the expression
\begin{equation}
\label{eq:boxy}
\KL(p_D \| p_{\vec{\alpha},\vec{\theta}}) =
E_{(x,y) \sim D} [\log \frac{1}{p_{\vec{\alpha},\vec{\theta}}(y \mid x)}]
- K_D.
\end{equation}
\commentout{
%peter1 probably we don't want this in explicitly. Too long
\begin{multline}
= \\
\frac{1}{3}\left[
\frac{1}{2} \log
\frac{1}{\alpha_1 (1 \theta_1) + \alpha_2 (1 \theta_2) + \alpha_3 (1 \theta_3)}
+ \frac{1}{2} \log
\frac{1}{\alpha_1 (1 \theta_1) + \alpha_2 (\theta_2) + \alpha_3 (\theta_3)}
\right] + \\
\frac{1}{3}\left[
\frac{1}{2} \log
\frac{1}{\alpha_2 (1 \theta_2) + \alpha_1 (1 \theta_1) + \alpha_3 (1 \theta_3)}
+ \frac{1}{2} \log
\frac{1}{\alpha_2 (1 \theta_2) + \alpha_1 (\theta_1) + \alpha_3 (\theta_3)}
\right] + \\
\frac{1}{3}\left[
\frac{1}{2} \log
\frac{1}{\alpha_3 (1 \theta_3) + \alpha_2 (1 \theta_2) + \alpha_1 (1 \theta_1)}
+ \frac{1}{2} \log
\frac{1}{\alpha_3 (1 \theta_3) + \alpha_2 (\theta_2) + \alpha_1 (\theta_1)}
\right] \\
 K_D.
\end{multline}
where we split out the sum in the expectation over the three possible
situations $s_1,s_2$ and $s_3$. }
This expression is uniquely
minimized for some $\overline{p}^*$ with parameters
\begin{equation}
\label{eq:frutsel}
\alpha^*_1 = \alpha^*_2 = \alpha^*_3 = \frac{1}{3}
\text{\ and some $\theta^* \in [0,1]$ with \ }
\theta^* =
\theta_1 = \theta_2 = \theta_3.
\end{equation}
To see this, note that by symmetry of
the problem, $ \KL(p_D \| p_{\vec{\alpha},\vec{\theta}}) = \KL(p_D \|
p_{\vec{\alpha}',\vec{\theta}'})$ where
$\vec{\alpha}' := (\alpha_2,\alpha_1,\alpha_3)$ and
%\text{\ and \ }
$\vec{\theta}' := (\theta_2, \theta_1,\theta_3)
$.
Since $\overline{\cP}$ is closed under mixing, for any $\gamma \in
[0,1]$, $p_\gamma := \gamma p_{\vec{\alpha},\vec{\theta}} + (1-
\gamma) p_{\vec{\alpha}',\vec{\theta}'}$ must be in $\overline{\cP}$.
By strict convexity of KL divergence \citep{CoverT91} and symmetry of the
problem, $\KL(p_D \| p_{\gamma})$ is uniquely minimized for $\gamma =
1/2$, and then $p_{\gamma}$ satisfies $\alpha_1 = \alpha_2$ and
$\theta_1 = \theta_2$. In the same way one shows that the minimizing
$\vec{\alpha}$ and $\vec{\theta}$ have to satisfy $\alpha_2 =
\alpha_3$, $\theta_2 = \theta_3$ and $\alpha_1 = \alpha_3, \theta_1 =
\theta_3$, and (\ref{eq:frutsel}) follows. Now plugging the
minimizing parameters (\ref{eq:frutsel}) into (\ref{eq:boxy}) gives
\begin{multline}
\KL(p_D \| \overline{p}^*) =
\min_{\theta \in [0,1]}
\KL(p_D \| p_{\vec{\alpha}^*,\vec{\theta}^*}) = \\
\min_{\theta}
- \frac{1}{2} \left [\log (1 - \theta) + \log (1 +\theta) - \log 3 \right]
- K_D = \frac{1}{2} \log 3 - K_D < .8 - K_D,
\end{multline}
which is strictly smaller than (\ref{eq:boxz}). Therefore, while (a)
Li's consistency result (Section~\ref{sec:consistmis}) for convex
$\overline{\cP}$ suggests that both the
Bayesian posterior and Bayesian MAP conditional distribution will
converge (in expected KLdivergence), to
$\overline{p}^*$, it turns out that, (b)
the classification error rate of the
Bayes classifier $c_{p^*}$ corresponding to the resulting
conditional distribution $\overline{p}^*$ is equal to
$$
E_{(x,y) \sim D}[ | y - c_{p^*}(x)| ]=
\left[ \frac{1}{2} \cdot 1 + \frac{1}{2} \cdot 0 \right] = \frac{1}{2},
$$
which is {\em worse\/} than the optimal classification error rate
that can be obtained within $\overline{\cP}$: since $\cP \subset
\overline{\cP}$, by (\ref{eq:snowy}) this error rate must be $\leq
1/3$.
Concluding, with $D$probability 1, for large $m$, the error rate of
the Bayes classifier based on the Bayesian posterior relative to
$\overline{\cP}$ will have classification error that is {\em larger\/}
than that of the Bayesian posterior relative to ${\cP}$: it
is clear that by enlarging the model $\cP$ to its convex closure,
rather than sometimes not converging at all, we may now converge to a
suboptimal distribution: instead of solving the problem, we merely
replaced it by another one.
\end{example}
\subsubsection{Isn't the example just ``unnatural''?}
Upon hearing our results, several people objected that our learning
problem is ``unnatural''. We agree that it is unlikely that one will
ever deal with such a scenario in practice. However, this does not
rule out the possibility that related phenomena do occur in more
practical settings; see \citep{Clarke04} for an example in a regression
context. Part of the problem here is of course that it is not really
clear what ``unnatural'' means. Indeed, it is certainly not our aim to
show that ``Bayesian inference is bad''. Instead, one of our main messages is that more research is needed to determine under what types of
misspecification Bayes performs well, and under what types it does
not.
\section{Interpretation from an MDL Perspective}
\label{sec:mdl}
We now discuss the interpretation of our result from an MDL
perspective. Similar to the Bayesian analysis, we do this by answering
objections that a cautious description length minimizer might have to
this work.
\addtocounter{subsection}{1}
\addtocounter{subsubsection}{6}
\subsubsection{Why is the two-part code (\ref{eq:cmdl}) the appropriate formula to work with? Shouldn't we use more advanced versions of MDL based on one-part codes?}
Equation (\ref{eq:cmdl}) has been used for classification by various
authors; see, e.g., \citep{Rissanen89,QuinlanR89} and
\citep{KearnsMNR97}. \citep[Chapter 5]{Grunwald98b} first noted that
in this form, by using Stirling's approximation, (\ref{eq:cmdl}) is
essentially equivalent to MAP classification based on the models
$p_{c,\theta}$ as defined in Section~\ref{sec:algorithms}. Of course,
there exist more refined versions of MDL based on one-part rather than
two-part codes \citep{BarronRY98}. To apply these to classification,
one somehow has to map classifiers to probability distributions
explicitly. This was already anticipated by \cite{MeirM95} who used
the transformation described in this paper to define one-part MDL
codes. The resulting approach is closely related to the Bayesian
posterior approach $\cbayes$, suggesting that a version of the
inconsistency Theorem~\ref{thm:bayesinconsistent} still applies.
\cite{Rissanen89} considered mapping classifiers $\cC$ to
distributions $\{ p_{c,\theta^*} \}$ for a {\em single\/} value of
$\theta^*$, e.g., $\theta^* = 1/3$. As discussed in
Section~\ref{sec:technical}, a version of
Theorem~\ref{thm:bayesinconsistent} still applies to the resulting
distributions.
We should note that both \cite{WallaceP93} and \cite{QuinlanR89}
really use an extension of the coding scheme expressed by
(\ref{eq:cmdl}), rather than the exact formula (\ref{eq:cmdl}) itself: both
publications deal with decision trees, and apply (\ref{eq:cmdl}) on
the level of the leaf nodes of the decision trees. The actual
codelength for the data given a decision tree becomes a sum of
expressions of the form (\ref{eq:cmdl}), one for each leaf. This
means that they are effectively estimating error rates separately for
each leaf. Since their model consists of the set of all decision trees
of arbitrary depth, they can thus essentially model almost any conditional
distribution of $Y$ given $X$. This makes their approach
nonparametric, and therefore, broadly speaking, immune to
misspecification as long as data are i.i.d., and therefore immune to
our results: inconsistency can only arise if the coding scheme
(\ref{eq:cmdl}) is applied to a model that can only represent
homoskedasticity, whereas the data generating distribution is
heteroskedastic. It is not clear though whether the use of
nonparametric models such as decision trees always solves the problem
in {\em practice}, as we already discussed in Section 6.3.5., Question
1. As a further (but inessential) difference, \cite{QuinlanR89} use
one extra bit on top of (\ref{eq:cmdl}) for each leaf node of the
decision tree. \cite{WallaceP93} point out that this is unnecessary,
and use more general codes based on Beta-priors, of which our code
(\ref{eq:cmdl}) is a special case, obtained with the uniform prior
(see Proposition~\ref{prop:uniboem} in the Appendix). As can be seen
from the proof of Theorem 2, the use of general Beta-priors in the
definition of MDL will not affect the inconsistency result.
\subsubsection{Does the coding scheme for hypotheses make sense from an MDL perspective?}
MDL theory prescribes the design of codes for hypothesis spaces
(roughly corresponding to priors) that minimize worst-case regret or
redundancy \citep{BarronRY98,Grunwald07} of the resulting codelength of
hypothesis $+$ data. It may seem that our coding scheme for hypotheses
does not satisfy this prescription. But in fact, it does: if no
natural grouping of the hypotheses in subclasses exists (such as with
Markov chains, the class of $k$th order Markov chains being a natural
subclass of the class of $k+1$st order chains), then the `best', from
an MDL perspective, code one can assign is a code such that the code
length of $c_i$ goes to infinity as slowly as possible with increasing
index $i$ \citep{Grunwald07}, such as Rissanen's universal code for the
integers (Equation~\ref{eq:priora}). But this is exactly the type of
codes to which our Theorem~\ref{thm:bayesinconsistent} applies!
Lest the reader disagree with this: according to `standard' MDL
theory, if $\cP$ is well-specified and countable then the coding
scheme should even be asymptotically irrelevant: {\em any\/} coding
scheme for the hypothesis where the codelength of any $P \in \cP$ does
not depend on $n$, will lead to asymptotically consistent MDL
inference under very weak conditions \citep{BarronC91};
see also Chapter 5, Theorem 5.1 of \citep{Grunwald07}. Special
types of codes minimizing worst-case regret are only needed to {\em
speed up\/} learning with small samples; for large samples, any
code will do. Thus, our result shows that if a set of classifiers
$\cC$ is used (corresponding to a misspecified probability model
$\cP$), then the choice of prior becomes of crucial importance, even
with an infinite amount of data.
\subsubsection{It seems that MDL can already be inconsistent even if ${\cal P}$ is well-specified. So why is the result interesting?}
This question mirrors Question 6.3.2. In Section 1.2 of
\citep{WallaceD99b}, a very simple problem is discussed for which a
straightforward implementation of a two-part code estimator behaves
quite badly, even though the true distribution is contained in the
model ${\cal P}$, and ${\cal P}$ only contains 1 continuous-valued
parameter. This suggests that MDL may be inconsistent in a setting
that is much simpler than the one we discuss here. But this is not
quite the case: if the true distribution is contained in
${\cal P}$, then {\em any\/} two-part code will be asymptotically
consistent, as long as the code is `universal'; see Theorem 15.3 in Chapter 15
of \citep{Grunwald07}. Under the definition of MDL that has generally
been adopted since \cite{BarronRY98}, an estimator based on a two-part
code can only be called `MDL estimator' if the code is universal. Thus, it may either be the case that the
two-part code defined by \cite{WallaceD99b} is not universal, and
hence not an MDL code, or their two-part code must be asymptotically
consistent after all. We suspect that the latter is the case. From an
MDL perspective, the interest in our example is that, under
misspecification, we can get inconsistency, even though we do
use a universal two-part code.
\subsubsection{Haven't \cite{KearnsMNR97} already shown that MDL is no good for classification?}
It may seem that the results are in line with the investigation of
\cite{KearnsMNR97}. This, however, is not clear --- Kearns et al.
consider a scenario in which two-part code MDL for classification
shows quite bad experimental performance for large (but not infinite!)
sample sizes. However, according to \cite{ViswanathanWDK99}, this is
caused by the coding method used to encode hypotheses. This method
does not take into account the precision of parameters involved
(whereas taking the precision into account is a crucial aspect of
MDL!). In the paper \citep{ViswanathanWDK99}, a different
coding scheme is proposed. With this coding scheme, MML (an inference method that is related to MDL, see below) apparently behaves quite well on the
classification problem studied by Kearns et al. In contrast to
Kearns' example, in our case (a) there is no straightforward way to
improve the coding scheme; (b) MDL fails even on an infinite sample.
\subsubsection{What about MML?}
The {\em Minimum Message Length (MML) Principle\/}
\citep{WallaceB68,ComleyD05,Wallace05} is a method for inductive
inference that is both Bayesian and compression-based. The similarities and differences with MDL are subtle; see, for example, Section 10.2 of \citep{Wallace05} or Section 17.4 of \citep{Grunwald07}, or \citep{WallaceD99a,WallaceD99b}. An anonymous
referee raised the possibility that MML may be consistent for the
combination of the learning problem and the misspecified probability
model discussed in this paper. We suspect that this is not the case,
but we are not sure of this, and for the time being, the question of
whether or not MML can be inconsistent under misspecification in
classification contexts remains open. For the well-specified case, it
is conjectured on page 282 of \citep{WallaceD99a} that only MML or
closely related techniques can infer fully-specified models with both
statistical consistency and invariance under one-to-one
parameterization.
\paragraph{Related Work}
\cite{Yamanishi98} and \cite{Barron91} proposed
modifications of the two-part MDL coding scheme so that it would be
applicable for inference with respect to general classes of predictors
and loss functions, including classification with $0/1$-loss as a
special case. Both Yamanishi and Barron prove the consistency (and
give rates of convergence) for their procedures. Similarly,
McAllester's (1999) PACBayesian method can be viewed as
a modification of Bayesian inference that is provably consistent for
classification, based on sophisticated extensions of the Occam's Razor
bound,
Theorem~\ref{thm:orb}.
%Both Yamanishi's and
%Barron's schemes are provably consistent; (very roughly speaking they
%modify the twopart code criterion by multiplying the description
%length $L_1(c)$ by a factor of order $O(\sqrt{m})$ or $O(\sqrt{m}/
%\log m)$.
These modifications anticipate our result, since it must have been
clear to the authors that without the modification, MDL (and discrete
Bayesian MAP) are not consistent for classification. Nevertheless, we
seem to be the first to have explicitly formalized and proved this.
\section{Conclusion and Future Work}
We showed that some standard versions of MDL and Bayesian inference
can be inconsistent for a simple classification problem, and we
extensively discussed the interpretation of this result. As possible
future work, it would be interesting to investigate
\begin{enumerate}
\item Whether there is a more natural learning problem, especially a
more natural feature space, with respect to which an analogue to our
result still holds.
\item Whether a similar result holds for regression rather than
classification problems. We conjecture that the answer is {\em yes,
but the suboptimality will be less dramatic}.
\end{enumerate}
\section{Acknowledgments}
The ideas in this paper were developed in part during the workshop
{\em Complexity and Inference}, held at the DIMACS center, Rutgers
University, June 2003. We would like to thank Mark Hansen, Paul
Vit\'anyi and Bin Yu for organizing this workshop, and Dean Foster and
Abraham Wyner for stimulating conversations during the workshop.
%peter3 added
We would especially like to thank Marcus Hutter who pointed out an
error in a previous version of Theorem 2.
This work was supported in part by the IST Programme of the European
Community, under the PASCAL Network of Excellence,
IST-2002-506778. This publication only reflects the authors' views.
%\bibliographystyle{plain}
\bibliography{master,MDL,peter}
\appendix
\section*{Appendix: Proposition~\ref{prop:uniboem} and its Proof}
\begin{proposition}
\label{prop:uniboem}
Consider any given sample $S$ of arbitrary size $m$.
\begin{enumerate}
\item Let $c \in \cC$ be an arbitrary classifier and let
$P(\theta \mid c)$ be given by the
uniform prior with $P(\theta \mid c) \equiv 1$.
Then
\begin{multline}
\label{eq:uniboem}
- \log P(y^m \mid x^m, c) = - \log \int_0^1 P(y^m \mid x^m, c, \theta) d \theta
= \\ \log (m+1) + \log \binom{m}{m\hat{e}_S(c)},
\end{multline}
so that, if the uniform prior is used, then $\cmdl = \csmapb$.
\item Suppose that $P(\theta \mid c)$ satisfies (\ref{eq:priorb}) or (\ref{eq:discreteprior}), and that for some $\alpha > 0$, $\hat{e}_S(c) < 0.5  \alpha$. Then
\begin{multline}
\label{eq:bayesbound}
m H(\hat{e}_S(c)) = \log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))}
\leq \log \frac{1}{P(y^m \mid x^m, c)}
\leq \\
\log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))} +
f_{\alpha}(m) =
%$ \ \\
%\ \ \ \ \ \ \ \ \ \ $\text{\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ }
m H(\hat{e}_S(c)) + f_{\alpha}(m),
%$ \
\end{multline}
where $f_{\alpha}(m) = O(\log m)$, and the constant in the $O$term
may depend on $\alpha$.
\end{enumerate}
\end{proposition}
\begin{proof}
We recognize the integral in (\ref{eq:uniboem}) as being a
beta-integral. Straightforward evaluation of the integral (e.g. by
partial integration) gives the result of part (1).
For part (2), the leftmost and rightmost equalities follow by straightforward
rewriting. The first inequality follows because
\begin{multline}
\log \frac{1}{P(y^m \mid x^m, c)} =
\log \frac{1}{\int P(y^m \mid x^m, c,\theta) P(\theta) d\theta}
\geq
\log \frac{1}{P(y^m \mid x^m, c, \hat{e}_S(c))}, \nonumber
\end{multline}
since the likelihood $P(y^m \mid x^m, c, \theta)$ is maximized
at $\theta = \hat{e}_S(c)$.
For the second inequality, we first consider the case that
$P(\theta \mid c)$ satisfies (\ref{eq:priorb}). Then using (\ref{eq:uniboem}),
\begin{multline}
\log \frac{1}{P(y^m \mid x^m, c)} \leq
- \log \int_0^{0.5} P(y^m \mid x^m, c, \theta) d \theta - \log \gamma
\leq \\ - \log
\int_0^{1} P(y^m \mid x^m, c, \theta) d \theta
+ \log \frac{\int_0^{1} P(y^m \mid x^m, c, \theta) d \theta}{
\int_0^{0.5} P(y^m \mid x^m, c, \theta) d \theta} - \log \gamma = \\
\log (m+1) + \log
\binom{m}{m\hat{e}_S(c)} - \log \gamma + o(1),
\end{multline}
where the constant in the $o(1)$ depends on $\alpha$.
The result for $P(\theta)$ satisfying (\ref{eq:priorb}) now follows
upon noting that for all $s \in \{0, 1, \ldots, m \}$, $m H(s/m) \geq
\log \binom{m}{s}$. This is the case because $m H(s/m)$ is the number
of bits needed to encode $m$ outcomes with $s$ ones, using a Bernoulli
distribution with parameter $s/m$; whereas $\log \binom{m}{s}$ is the
number of bits needed to encode $m$ outcomes with $s$ ones, using a
Bernoulli distribution with parameter $s/m$, conditioned on the
relative frequency of $1$s being $s/m$ --- thus, the same sequence is
encoded using the same code, but conditioned on extra information, so
that equally many or fewer bits are needed.
Now consider the case that $P(\theta \mid c)$ satisfies (\ref{eq:discreteprior}). Then \begin{multline}
P(y^m \mid x^m, c) = \sum_{\theta \in [0,1] \cap {\mathbb Q}}
P(y^m \mid x^m, c, \theta) P(\theta \mid c) \geq \\
P(y^m \mid x^m, c, \hat{e}_S(c) )P(\hat{e}_S(c) = \theta \mid c) \geq
P(y^m \mid x^m, c, \hat{e}_S(c) ) K_1 m^{-K_2},
\end{multline}
for some constants $K_1$ and $K_2$. The result now follows by taking negative
logarithms.
\end{proof}
\end{document}