\documentclass{llncs}
\usepackage{llncsdoc,cite}
\usepackage{latexsym,amssymb,amsmath,eepic,graphicx,float,floatflt,parskip}
%\usepackage{sectsty}
\newcommand{\shrink}[1]{}
\newcommand{\ignore}[1]{\relax}
\makeatletter
\newcommand{\setfloattype}[1]{\renewcommand{\@captype}{#1}}
\makeatother
%
\newcommand{\f}{f}
\newcommand{\g}{g}
\newcommand{\auc}{\mbox{\textsc{auc}}}
\newcommand{\fas}{\mbox{\textsc{fas}}}
\renewcommand{\d}{d} % {\operatorname{deg}}
\newcommand{\TT}{\mathbf{T}}
\newcommand{\DD}{\mathbf{D}}
\newcommand{\HH}{\mathbf{H}}
\newcommand{\II}{\mathbf{I}}
\newcommand{\DDstar}{\mathbf{D}^\star}
\newcommand{\one}{\mathbf{1}}
\newcommand{\ra}{\rightarrow}
\newcommand{\suml}{\sum_{\ell \in L}}
\newcommand{\sumw}{\sum_{w \in W}}
\newcommand{\seq}[1]{\langle {#1} \rangle}
\renewcommand{\aa}{i}
% \newcommand{\bb}{{i+1}}
\newcommand{\bb}{{j}}
\newcommand{\xx}{'}
\newcommand{\cc}{_{j+1}}
\newcommand{\oo}{_{i-1}}
\newcommand{\da}{d_\aa}
\newcommand{\db}{d_\bb}
\newcommand{\dx}{d\xx}
\newcommand{\dc}{d\cc}
\newcommand{\doo}{d\oo}
\newcommand{\la}{l_\aa}
\newcommand{\lb}{l_\bb}
\newcommand{\lx}{l\xx}
\newcommand{\lc}{l\cc}
\newcommand{\lo}{l\oo}
\newcommand{\wa}{w_\aa}
\newcommand{\wb}{w_\bb}
\newcommand{\wx}{w\xx}
\newcommand{\wc}{w\cc}
\newcommand{\wo}{w\oo}
\newcommand{\xstar}{x^{\star}}
\newcommand{\rank}{\operatorname{rank}}
\newcommand{\myhead}[1]{\subsection*{#1}}
%
\renewcommand{\a}{\alpha}
\newcommand{\e}{\epsilon}
\newcommand{\E}{\mathbb{E}}
\newcommand{\U}{U}
\newcommand{\N}{\mathbb{N}}
\renewcommand{\Re}{\mathbb{R}}
\newcommand{\isom}{\equiv}
\newcommand{\cond}{\; | \;}
\renewcommand{\b}{\beta}
\newcommand{\I}{\mathbf{1}}
%\newcommand{\one}[1]{\mathbf{1}\left({#1}\right)}
\newcommand{\prob}{\mathbb P}
\def\l{\lambda}
\renewcommand{\k}{\ensuremath{\kappa}}
%\newcommand{\g}{\ensuremath{\gamma}}
\newcommand{\ie}{i.e.}
\newcommand{\eqdef}{\doteq}
\renewcommand{\O}[1]{O\left(#1\right)}
\renewcommand{\deg}{\operatorname{deg}}
\newcommand{\opt}{\operatorname{opt}}
\newcommand{\back}{\operatorname{back}}
\newcommand{\sign}{\operatorname{sign}}
\newcommand{\out}{\operatorname{outd}}
\newcommand{\indeg}{\operatorname{ind}}
\newcommand{\To}{{T_0}}
\newcommand{\Tf}{{T_c}}
\newcommand{\Do}{\deg_0}
\newcommand{\Df}{\deg_c}
\newcommand{\set}[1]{\{ {#1} \}}
\newcommand{\sumil}{\sum_{i \in L}}
\newcommand{\sumjw}{\sum_{j \in W}}
\newcommand{\summ}{\sumil \sumjw}
\newcommand{\summi}{\sum_{i=1}^{|L|}}
\newcommand{\summj}{\sum_{j=1}^{|W|}}
\newcommand{\summm}{\summi \summj}
\newcommand{\ca}{c^A_s}
\newcommand{\cb}{c^B_s}
\newcommand{\half}{\text{\textonehalf}}
\newcommand{\li}{\ell_i}
\newcommand{\wj}{w_j}
% \renewcommand{\d}{d}
\def\gtt[#1>#2]{\one{{#1} \geq {#2}}}
% \def\gtt[#1>#2]{\mathbf{1} \left( {#1} , {#2} \right) }
\newcommand{\elll}{\textsf{L}}
%
\floatstyle{ruled}
\newfloat{algorithm}{tbp}{loa}
\floatname{algorithm}{Algorithm}
\newcommand{\qedblob}{\mbox{\rule[-1.5pt]{5pt}{10.5pt}}}
\newcommand{\mA}{\mbox{\small{A}}}
\newcommand{\mB}{\mbox{\small{B}}}
\newcommand{\mC}{\mbox{\small{C}}}
\newcommand{\cX}{{X}}
\newcommand{\Q}{Q}
\newcommand{\ord}{o}
%\renewenvironment{proof}{\noindent{\bf Proof:~~}}{\qedblob}
\pagestyle{empty}
\title{Robust Reductions from Ranking to Classification}
\author{
Maria-Florina Balcan~\inst{1} \and
Nikhil Bansal~\inst{2} \and
Alina Beygelzimer~\inst{2} \and
Don Coppersmith~\inst{3} \and
John Langford~\inst{4} \and
Gregory B. Sorkin~\inst{2}}
%\thanks{Research supported in part by EPSRC grant GR/S26323/01}
\institute{
Carnegie Mellon University, Pittsburgh, PA \\
\email{ninamf@cs.cmu.edu}
\and
IBM Thomas J.~Watson Research Center, Yorktown Heights and Hawthorne, NY \\
\email{\{bansal,beygel,sorkin\}@us.ibm.com}
\and
IDA Center for Communications Research, Princeton, NJ \\
\email{dcopper@idaccr.org}
\and
Yahoo Research, New York, NY \\
\email{jl@yahoo-inc.com}
}
\begin{document}
\maketitle
\begin{abstract}
We reduce ranking, as measured by the Area Under the Receiver Operating
Characteristic Curve (AUC), to binary classification. The core
theorem shows that a binary classification regret of $r$ on the
induced binary problem implies an AUC regret of at most $2r$.
This is a large improvement over approaches such as ordering according
to regressed scores, which have a regret transform of $r \mapsto nr$
where $n$ is the number of elements.
\end{abstract}
\section{Introduction}
We consider the problem of ranking a set of instances.
In the most basic version, we are given a set
of unlabeled instances belonging to two classes, 0 and 1, and the
goal is to rank all instances from class 0 before any instance from
class 1. A common measure of success for a ranking algorithm is the area
under the ROC curve (AUC).
The associated loss, $1-\mbox{AUC}$, measures
how many pairs of neighboring instances would have to be swapped to
repair the ranking, normalized by the number of 0s times the number of 1s.
The loss is zero precisely when all 0s precede all 1s;
one when all 1s precede all 0s.
It is greater for mistakes at the beginning and the
end of an ordering, which satisfies the intuition that an unwanted
item placed at the top of a recommendation list should have a higher
associated loss than when placed in the middle. %A handy shorthand
%for understanding this loss is that it is the normalized bubble-sort
%distance between the predicted ordering and the true ordering (i.e.,
%the number of pairs in the predicted ordering when a 1 comes before a 0,
%normalized by the number of 1s times the number of 0s).
In binary classification, where the problem is simply to predict
whether a label is 0 or 1, the loss is measured by the error rate, i.e.,
the probability of a misclassification.
Here any misclassified instance incurs the same loss independently
of how other instances are classified, while the AUC loss
depends on the whole (ranked) sequence of instances.
It is natural to ask whether we need fundamentally different algorithms
to optimize these two loss functions.
%
This paper shows that the answer is no, in the sense that
the problem of optimizing the AUC can be reduced to
classification so that good performance on the classification
problem implies good performance on the AUC problem.
%We call a pair of instances \emph{mixed} if they have different labels.
The classification
problem is to predict, given a random pair of instances
with different labels,
whether the first instance should be ordered before the second.
We show that there is a robust mechanism for translating
any binary classifier learning algorithm into a ranking algorithm.
%Several observations should help understand the setting and the result better.
\noindent
{\bf Relation to Score-Based Ranking: }
A common way to generate a ranking is to
learn a \emph{scoring function} $f: X\rightarrow \mathbb{R}$,
which assigns a real-valued score to each example in the instance space $X$,
and then rank test examples in the order of their scores.
If $f$ is learned by minimizing some loss function that depends on
individual examples, this approach is not robust.
The fundamental difficulty is exhibited by highly unbalanced test sets.
If we have one 1 and many 0s, a pointwise loss on the 1 with a perfect
prediction for the 0s can greatly harm the AUC while only slightly affecting
the pointwise loss with respect to the induced distribution.
For concreteness, let $f(x)$ be 1 if the predicted label is 1, and 0 otherwise.
Then if $n$ is the number of elements in the test set,
$f$ can induce an AUC loss of 1
with classification loss of $1/n$.
Thus such schemes transform pointwise loss $l$
to AUC loss $nl$.
The same observation holds for {\it regrets} in place of losses:
pointwise regret $r$ translates into AUC regret~$nr$.
{\it Regret} here is the difference between the incurred loss and the
lowest achievable loss on the problem.
The motivation for regret analysis is to separate avoidable loss from
noise intrinsic to the problem, so that bounds apply nontrivially even
for problems with large intrinsic noise.
To avoid the robustness problem, many score-based ranking algorithms optimize
loss functions that depend on pairs of examples, e.g.,
the probability that a pair of examples with different labels is misranked
(see \cite{shivani-uniform,lugosi,freund,rudin}).
%We call a pair of instances \emph{mixed} if they have different labels.
Another approach, taken in \cite{cohen,ailon-mehryar} as well as here,
is to learn a \emph{preference function}
$c: X\times X\rightarrow \{0,1\}$ on pairs of examples.
If $c(x,x')=1$ then $c$ ranks $x$ higher than $x'$, and $c(x,x')=0$
indicates the opposite preference.
Notice that while a scoring function imposes a total order on the entire
instance space, a preference function is not required to be transitive.
If, for example, the underlying distribution is such that an optimal preference
function is non-transitive, then not imposing a total order on the entire space
may result in a better performance on limited subsets on which
the algorithm is invoked.
Since $c$ is not necessarily consistent with a linear ordering on
a test set $\U$, there is a need for a subsequent step which produces
an ordering of $\U$ based on the learned $c$.
Because $c$ is typically learned using a binary classification algorithm,
we call it a classifier.
%We want to bound the realized AUC performance (as measured
%by loss and regret) in terms of the realized classification performance.
%We will see that achieved performance
%can be robustly transferred from classification to ranking.
\noindent
{\bf Relation to the Feedback Arc Set Problem: }
Consider a set $\U$ with a hidden bipartition into a set of
\emph{winners} and a set of \emph{losers}.
Imagine that every element (also called player)
of $\U$ plays all other elements, and the outcome of each
play is determined by a classifier $c$.
%trained to predict which of the two given elements should
%be ordered first.
The tournament induced by $c$ on $\U$ does not have to be consistent
with any linear ordering.
We want to find the best way to rank the elements in $\U$ so that
all winners are ordered before all losers.
A natural objective, dating back to Slater~\cite{slater},
is to find an ordering which agrees
with the tournament on as many player pairs as possible, i.e.,
minimizes the number of inconsistent pairs where a higher-ranked
player (one ordered closer to the beginning of the list) % actually
lost to a lower-ranked player. This is the NP-hard ``minimum feedback
arc set problem in tournaments''. (Although the hardness was
conjectured for a long time, it was proved only recently; see~\cite{alon}.)
A \emph{mistake} is defined as a winner--loser pair where
the loser beats (i.e., is preferred to) the winner.
Section~\ref{fas} proves that a solution to the feedback arc set problem
satisfies a basic guarantee:
If the classifier $c$ makes at most $k$ mistakes on $\U$, then the algorithm
minimizing the number of inconsistent pairs produces an ordering,
or equivalently a transitive tournament, with at most $2k$ mistakes on $\U$.
Section~\ref{lower} shows that this bound is tight.
Instead of solving feedback arc set,
another natural way to break cycles
is to rank instances according to their number of wins in the tournament
produced by $c$.
The way ties are broken is inessential; for definiteness, let us say
they are broken against us.
Coppersmith, Fleischer, and Rudra~\cite{rudra} proved that this
algorithm provides a % (factor of)
5-approximation for the feedback arc set problem.
An approximation, however, does not generally imply
that the ratio of the number of mistakes made by the approximation to
the number of mistakes made by $c$ is finite. % (and thus doesn't
%have direct implications for the AUC problem).
For example, $c$ may make no mistakes (i.e., make
correct predictions on all winner--loser pairs)
while inducing a non-transitive tournament on the winners or the losers,
so an approximation that does not know the labeling
can incur a non-zero number of mistakes.
We prove, however, that the algorithm that orders the elements
by their number of wins, has the {same} regret
and loss transforms as an optimal solution to the NP-hard feedback
arc set problem.
Again, Section~\ref{lower} shows that solving feedback arc set does no better.
% \footnote{By the same logic, the new PTAS for feedback arc set
% \cite{Claire Kenyon-Mathieu and Warren Schudy}
% is also, surprisingly, of no use in this context.}
\noindent
{\bf Results: }
The core theorem (Theorem~\ref{thm:regret})
says that a pairwise classifier with regret $r$ % on pairs
implies AUC regret at most~$2r$,
for arbitrary distributions over instances.
For example, if the binary error rate is 20\% due to inherent noise
and 5\% due to the errors made by the classifier,
then AUC regret is at most 10\%, i.e., only the 5\% are at most doubled.
The same statement holds for losses in place of regrets.
As shown in \cite{ailon-mehryar} (see Section~\ref{lower}),
this is best possible with any deterministic algorithm.
Section~\ref{multi} proves that Theorem~\ref{thm:regret}
holds for a natural generalization of AUC regret to
multiple labels.
In a subsequent paper, Ailon and Mohri~\cite{ailon-mehryar} describe
a randomized quick-sort reduction, which guarantees that
AUC loss is bounded by binary loss,
in expectation over the randomness of the algorithm.
They also define a generalization of AUC loss to multiple
labels, and show that the expected generalized
AUC loss is bounded by twice the binary loss.
This generalization encodes the generalization
we analyze in Section~\ref{multi} as a special case.
Using the decomposition in Theorem~\ref{thm:regret}, the argument
in \cite{ailon-mehryar}
could potentially be extended to bound AUC regret, rather than loss.
The quick-sort algorithm is more efficient, requiring
only $O(n\log n)$ instead of $\Theta(n^2)$ classifier evaluations
at test time, which makes it practical in larger settings.
On the other hand, the guarantees in \cite{ailon-mehryar} are in expectation
over the algorithm's randomness. %It would be interesting
%to see a bound on the variance of the incurred loss.
%Theorem~\ref{thm:regret} reduces the learning problem to a combinatorial
%problem on tournaments described above.
Cohen, Schapire, and Singer~\cite{cohen} operate in a similar two-stage
setting: They first learn a preference function
that takes a pair of instances and returns a score predicting how certain
it is that the first instance should be ranked before the second.
The learned function is then evaluated on all pairs of instances in
the test set and the instances are ordered using the degree-based algorithm
used here.
%~\footnote{
%The approximation algorithm they use orders by the weighted sum of wins
%minus the weighted sum of loses, in the induced tournament obtained
%by eliminated instances that have already been ordered.}
%the largest possible $l_1$ agreement
%with the predictions is created, using the degree-based algorithm.
%(which orders by the weighted sum of wins
%minus the weighted sum of loses in the induced tournament obtained
%by eliminated all instances that have already been ordered).
One of the results they show is that
the agreement of an optimal feedback arc set ordering with
the learned predictions is at most twice the agreement obtained
by their algorithm.
To translate this result into the language of losses, let $\textsc{Mfa}$
be the AUC loss of the minimum feedback arc set ordering
and $\textsc{Approx}$ be the AUC loss of the approximation.
Then the result says that $1-\textsc{Approx} \geq \tfrac{1}{2}(1-\textsc{Mfa})$
or
$\textsc{Approx} \leq \tfrac{1}{2} + \textsc{Mfa}/2$.
The result is difficult to compare with the results given here,
as the settings are different.
% The settings are different, making the results given here difficult to compare.
A rough comparison requires specializations and
yields a bound that is weaker than ours:
%Applying the result in our setting requires specializations and
%yields bounds that are weaker than ours.
As we show in
Section~\ref{fas}, $\textsc{Mfa} \leq 2\,\textsc{Bin}$, where $\textsc{Bin}$
is the loss of the pairwise predictor, so the result of \cite{cohen}
roughly says that $\textsc{Approx} \leq \tfrac{1}{2} + \textsc{Bin}$ (which
is essentially vacuous because a random ordering has expected AUC loss of 1/2),
while we show that $\textsc{Approx} \leq 2\,\textsc{Bin}$, modulo the slight
difference in the binary problem.
Curiously, the relationship of ranking to classification is
functionally tighter than has been proven for regression to binary
classification ($r \mapsto \sqrt{r}$)~\cite{probing}.
%---------------------------------------------------------
\section{Preliminaries}\label{setup}
\noindent {\bf Classification: }
A \emph{binary classification problem} is
defined by a distribution $P$ over $\cX \times \{0,1\}$, where
$\cX$ is some observable feature space and $\{0,1\}$ is the binary label space.
(For simplicity of presentation, assume that the spaces are finite.)
The goal is to find a classifier
$c: \cX \rightarrow \{0,1\}$ minimizing the \emph{classification loss} on $P$
given by
$$
e(c,P) = \mathbf{Pr}_{(x,y)\sim P} [c(x) \neq y].
$$
The \emph{classification regret} of $c$ on
$P$ is defined as
\[ r(c,P) = e(c,P) - \min_{c^*} e(c^*,P),
\]
where the minimum is taken over all classifiers
$c^*: X\rightarrow \{0,1\}$.
\noindent
{\bf Ranking: }
Where $U\subseteq X$ is an unlabeled set, and $x,x'\in U$,
a \emph{preference function} $\pi(x,x',U)=1$
if $\pi$ ``prefers'' $x$ to $x'$, and 0 otherwise. For simplicity, let
$\pi(x,x',U)=1-\pi(x',x,U)$ for $x\not=x'$.
If $\pi$ is consistent with some linear ordering of $U$,
we call $\pi$ itself an \emph{ordering} of $U$, and denote
$\pi_U(x,x')=\pi(x,x',U)$.
% We say that $\pi$ is an {\it ordering} of a set $S$ if it is transitive on $S$,
% i.e., its pairwise preferences are consistent with some
% linear ordering of elements in $S$.
For a labeled set $S =\{(x_1,y_1),\ldots,(x_n,y_n)\}$,
let $U_S=\{x_1,\ldots,x_n\}$ denote the unlabeled set corresponding to $S$.
The \emph{AUC loss} of an ordering
$\pi_{U_S}$ on a set $S =\{(x_1,y_1),\ldots,(x_n,y_n)\}$ is defined as
$$
l(\pi_{U_S},S) = \frac{\sum_{i\neq j} \I(y_i > y_j)\,
\pi_{U_S}(x_i,x_j)}
{\sum_{i\neq j} \I(y_i > y_j)} .
$$
% NOTE(review): the source is garbled here (text between `<' and `>'
% was lost in extraction); the denominator above and the bridging text
% below are reconstructed -- verify against the published version.
For any $j>i$,
$r_{\Q}(j,i)$ is proper, which in turn implies that
$r_{\Q}(i,j)=0$. %, concluding the proof of the fact that all
%regret-zero pairwise predictions must be consistent with the ordering.
Applied repeatedly, Lemma~\ref{triangle} says that for any pair $i<j$,
the pair's contribution to the regret of $\pi$ relative to an optimal
$\pi^*$ is
$$
\frac{\mathbf{E}_{y^n\sim \Q}\left[\I(y_i>y_j)\pi(i,j) - \I(y_i>y_j)\pi^*(i,j)\right]}
{\sum_{u \neq v} \I(y_u > y_v)} .
$$
% NOTE(review): this span is garbled in the source; the display above
% is reconstructed from the surviving fragments -- verify against the
% published version.
(If the sum of the pairwise regrets is $> 0$, we have
$l_{\Q}(i,k) > l_{\Q}(k,i)$ and
$r_{\Q}(i,k)$ is proper.)
\quad\qedblob
\end{proof}
\shrink{
\begin{proof}
Let $p$ be a shorthand for the restriction of $Q$ to indices $\{i, j,
k\}$, obtained by summing over all other label indices. (Thus, for
example, $p(100)$ denotes the probability that $i=1$, $j=0$, and
$k=0$, under $Q$.) A simple algebraic manipulation verifies the
claim.
\begin{align*}
r_{\Q}(i,j)&+r_{\Q}(j,k) = \\
&\frac{p(100)+p(101)-p(010)-p(011) +p(010)+p(110)-p(001)-p(101)}{N}= \\
&\frac{p(100)+p(110)-p(001)-p(011)}{N}=r_{\Q}(i,k).
\end{align*}
where $1/N = \mathbf{E}_{y^n\sim \Q} \frac{1}{\sum_{u \neq v} \I(y_u > y_v)}$.
\quad\qedblob
\end{proof}
}
% NOTE(review): a large garbled gap follows (intervening material was
% lost in extraction); the closing of the \shrink block above and the
% re-opened proof below are reconstructed to keep the document
% balanced -- verify against the published version.
\begin{proof}
We first argue that for any $i\neq n$, $l_i>0$ (else Case~1a applies to $i,i+1$)
and for any $i\neq 1$, $w_i >0$ (else Case~1b applies to $i-1,i$).
In particular, for any $1<i<n$, both $l_i>0$ and $w_i>0$.
If $n\geq 4$ this implies immediately that Case~2 applies to the pair~$2,3$.
If $n=1$, $\DD$ is automatically canonical.
If $n=2$ and $l_2=0$ or $w_1=0$ then $\DD$ is canonical,
while if both $l_2,w_1>0$
we may apply Case~2
(since, as we first argued, $l_1,w_2>0$).
Finally, if $n=3$, we know $l_1,l_2,w_2,w_3>0$.
If $w_1 = l_3 = 0$ then $\DD$ is canonical,
and otherwise Case~2 applies.
% \end{proof}
\noindent
{\bf Transformation 1a:}\quad
In Case~1a, where
$\da$ has only winners,
change $\DD$ to a new sequence $\DD'$ by
%
replacing the pair $(\da,\wa,0)$, $(\db,\wb,\lb)$
by their ``average'': the single degree $(\dx,\wx,\lx)$,
where
\begin{align*}
\wx &= \wa+\wb , \quad
\lx = \lb, \quad
\dx = \frac{\wa \da + (\wb+\lb) \db}{\wa+\wb+\lb} .
\end{align*}
\ignore{
As in Case~0, $\DD$ still fits the conditions on a degree sequence:
the total weight on each degree remains integral,
and the (weighted) sequence is still
majorized by $\seq{0,\ldots,n-1}$. % was 1..n
The number of distinct degrees is reduced by one.
}
The stated conditions on a transformation are easily checked.
The total weight of winners is clearly preserved,
as is the total weight of losers and the total degree (out-edges).
Summing weights preserves integrality.
% The number of nonzero weights is reduced by one.
% -- unless w_j was already 0 (i.e., only w_i and l_j >0).
The number of distinct degrees is reduced by one,
and the number of nonzero weights may be decreased by one or
may remain unchanged.
The Landau majorization condition holds because
$\DD'$, as an averaging of $\DD$, is majorized by it,
and majorization is transitive.
% averaging preserves majorization.
The only non-trivial condition is the non-decrease in $\g-2\f$.
The number of loser--winner pairs where the loser outranks the winner
remains the same, so $g(\DD) = g(\DD')$.
Also, $f$ depends only on the total weight of losers (which is unchanged)
and on the average degree of losers.
This average degree would be unchanged if $\wa$ were~$0$;
since $\wa \geq 0$, the average degree may decrease.
Thus $f(\DD) \geq f(\DD')$,
and $(g-2f)(\DD) \leq (g-2f)(\DD')$, as desired.
\noindent
{\bf Transformation 1b:}\quad
Symmetrically to Transformation~1a,
obtain $\DD'$ by
replacing the pair of labeled weighted degrees $(\da,\wa,\la)$ and $(\db,0,\lb)$
with a single one $(\dx,\wx,\lx)$,
where $\wx = \wa$, $\lx = \la+\lb$, and
$\dx = [(\la+\wa) \da + \lb \db]/(\la+\wa+\lb)$.
\ignore{
replacing the pair $(\da,\wa,0)$, $(\db,\wb,\lb)$,
with a single degree $(\dx,\wx,\lx)$,
where
\begin{align*}
\wx &= \wa+\wb , \quad
\lx = \lb, \quad
\dx = \frac{(\wa) \da + (\wb+\lb) \db}{\wa+\wb+\lb} .
\end{align*}
Again, going from $\DD$ to $\DD'$, the value of $g$ is increased,
while that of $f$ is decreased, so $g-2f$ increases.
Again, the number of distinct degrees is reduced by one.
}
\noindent
{\bf Transformation 2:}\quad
Where $\wa$, $\la$, $\wb$ and $\lb$ are all nonzero,
we begin with one case, which leads to one other.
In the usual case, we transform $\DD$ to $\DD'$ by
replacing the pair $(\da,\wa,\la)$, $(\db,\wb,\lb)$ with
$(\da,\wa+x,\la-x)$, $(\db,\wb-x,\lb+x)$,
for some value of $x$ (positive or negative) to be determined.
This affects only the labeling, not the weighted degree sequence itself,
and is therefore legitimate
% as long as $x \leq \wa$, $x \geq -\la$, $x \geq -\wb$, and $x \leq \lb$.
as long as the four quantities
$\wa+x$, $\la-x$, $\wb-x$ and $\lb+x$ are all non-negative.
Defining $\Delta = (g-2f)(\DD')-(g-2f)(\DD)$,
we wish to choose $x$ to make $\Delta > 0$.
\begin{align*}
\Delta &=
\Big\{ \big[(\lb+x)(\wa+x+\wb-x)+(\la-x)(\wa+x)\big]
- \big[\lb (\wa+\wb)+\la \wa \big] \Big\}
\\ & \qquad \quad
- 2 \Big\{ \big[(\la-x) \da + (\lb+x) \db\big]
- \big[\la \da + \lb \db\big] \Big\}
\\ &=
x (\wb + \la - 2 (\db - \da) -x)
=
x (a-x),
\end{align*}
where $a = \wb+\la-2(\db-\da)$.
This is a simple quadratic expression with negative coefficient on $x^2$,
so its value increases monotonically as $x$ is varied from $0$ to $a/2$,
where the maximum is obtained.
(Note that $a$ may be negative.)
If $a=0$ then
% this transformation makes no change,
% and instead we use Transformation~3, below.
we do not use this transformation but Transformation~3, below.
Otherwise, vary $x$ from $0$ to $a/2$ stopping when $x$ reaches $a/2$
or when any of
$\wa+x$, $\la-x$, $\wb-x$ and $\lb+x$ becomes~$0$.
Call this value~$\xstar$, and use it to define the transformation.
If any of
$\wa+x$, $\la-x$, $\wb-x$ and $\lb+x$ is 0
then the number of nonzero weights is decreased
(while the number of distinct degrees is unchanged).
Otherwise, $\xstar=a/2$.
In that case, the new $\DD'$ has $a=0$
% (because another application of the same procedure would give $\xstar=0$).
(the optimal ``weight shift'' has already been performed).
With $a=0$ we apply Transformation~3,
which reduces the number of nonzero weights.
\noindent
{\bf Transformation 3:}\quad
Similar to Cases 1a and 1b, transform $\DD$ to $\DD'$ by
% averaging the degrees $\da$ and~$\db$, adding the corresponding labels weights
replacing the pair $(\da,\wa,\la)$, $(\db,\wb,\lb)$
with a single degree $(\dx,\wx,\lx)$ that is their weighted average,
\begin{align*}
\wx &= \wa+\wb , \quad
\lx = \la+\lb, \quad
\dx = \frac{(\wa+\la) \da + (\wb+\lb) \db}{\wa+\la+\wb+\lb} .
\end{align*}
This gives
\begin{align*}
\Delta & =
(g-2f)(\DD')-(g-2f)(\DD)
\\ &=
(\la \wb) - 2 ( \la \dx + \lb \dx - \la \da - \lb \db )
\\ &=
\la \wb +
\frac
{2(\db-\da)(\wa \lb - \wb \la)}
{\wa+\la+\wb+\lb} .
\end{align*}
We apply this transformation only in the case where
Transformation~2 fails to give any improvement
because its ``$a$'' expression
is equal to~0, i.e., $\db-\da=(\wb+\la)/2$.
% (More detail on the rules for applying the transformations follows, below.)
Making the corresponding substitution gives
\begin{align*}
% (\wa+\la+\wb+\lb) \cdot \Delta
% &=
% \la \wb (\wa+\la+\wb+\lb) +
% {(\wa+\lb)(\wa \lb - \wb \la)}
% \\ &=
\Delta
&=
\la \wb +
\frac
{(\wb+\la)(\wa \lb - \wb \la)}
{\wa+\la+\wb+\lb}
\\ &=
\frac
{ (\la \wb)(\lb+\wa) + (\lb \wa)(\la+\wb) }
{\wa+\la+\wb+\lb}
> 0.
\end{align*}
This reduces the number of distinct degrees by one,
without increasing the number of nonzero weights.
Concluding the argument, we have shown that any non-canonical configuration
$\DD$ can be replaced by a configuration with
% fewer nonzero weights and
a strictly smaller total of distinct degrees and nonzero weights,
and at least as large a value of $\g-2\f$.
% Since in $\DD$ each total weight $w_i+\ell_i$ is integral,
% there were at most $n+2$ nonzero weights originally
% (the $+2$ coming from our special highest and lowest degrees),
% and a canonical configuration $\DDstar$ is reached after at most $n-1$
% transformations.
Since $\DD$ had at most
$n$ distinct degrees and $2n$ nonzero weights originally,
a canonical configuration $\DDstar$ is reached after at most $3n-1$
transformations.
(All that is important is that the number of transformations is finite:
that a canonical configuration is eventually reached.)
Then, $(\g-2\f)(\DD) \leq (\g-2\f)(\DDstar) \leq 0$.
\quad\qedblob
\end{proof}
A further generalization of Theorem~\ref{thm3} may be found in~\cite{BCStech}.
\shrink{
We used the following lemma, due to Landau~\cite{landau}.
\begin{lemma}\label{land}
There exists a tournament with outdegree sequence
$\deg(1) \leq \deg(2) \leq \cdots \leq \deg(n)$ if and only if, for all
$1 \leq i \leq n$,
$\sum_{j=1}^i \deg(j) \geq \sum_{j=1}^i (j-1)$,
with equality for $i=n$.
\end{lemma}
}
\section{An Upper Bound for Minimum Feedback Arc Set}\label{fas}
This section shows an analog of Theorem~\ref{thm1}
for an optimal solution to the feedback arc set problem.
(The decomposition argument in
Theorem~\ref{thm:regret} is algorithm-independent and applies here as well.)
For a tournament $T$ and an ordering $\pi$, a \emph{back edge}
is an edge $i\ra j$ in $T$ such that $j$ is ordered before $i$ in $\pi$.
Let $\back(T,\pi)$ denote the number of back edges induced by $\pi$ in $T$.
For a winner--loser partitioned tournament $\TT = (T,W,L)$
and any minimum feedback arc set ordering $\pi$ of $T$,
let $g'(\TT,\pi)$ be the number of winner--loser pairs where
the loser comes before the winner in $\pi$, and as before let
$$\f(\TT)=\suml \sumw \one(\ell \ra w)$$
be the number of
winner--loser pairs where the loser beats the winner.
\begin{theorem} \label{thm4}
For any winner--loser partitioned tournament $\TT = (T,W,L)$ and any minimum
feedback arc set ordering $\pi$ of $\TT$,
$g'(\TT,\pi)\leq 2\f(\TT)$.
\end{theorem}
\begin{proof}
Let $k_w$ be the smallest possible number of back edges in the subtournament
induced by $W$. Define $k_l$ similarly for the subtournament induced by $L$.
Let $k_w^{\pi}$ and $k_l^{\pi}$ be the number of back edges in $\pi$ that
go from $W$ to $W$ and from $L$ to $L$, respectively.
Denote the number of remaining (i.e., winner--loser or loser--winner)
back edges in $\pi$ by $k_o^{\pi}$.
Consider another ordering $\sigma$ where all winners are ordered
before all losers, and both the winners and the losers are ordered
optimally among themselves, i.e., with $k_w$ and $k_l$ back edges
respectively. The number of back edges in $\sigma$ is
$\back(\TT,\sigma) = k_w + k_l + \f(\TT)$. But we also have
$\back(\TT,\sigma) \geq \back(\TT,\pi)$ since $\pi$ minimizes the
number of back edges, and thus $k_w + k_l + \f(\TT) \geq k_w^{\pi} +
k_l^{\pi} + k_o^{\pi}$. Since $k_w \leq k_w^{\pi}$ and $k_l \leq
k_l^{\pi}$ by definition of $k_w$ and $k_l$, we have $\f(\TT) \geq
k_o^{\pi}$.
Consider any winner--loser pair with the loser ordered before the
winner. If $w\ra l$ is the edge, it is a back edge in $\pi$ and thus
is counted by $k_o^{\pi}$. If $l\ra w$ is the edge instead, it is
counted by $\f(\TT)$. Thus $g'(\TT,\pi)$ is at most $k_o^{\pi} +
\f(\TT)$. Since $\f(\TT) \geq k_o^{\pi}$, this number is never more
than $2\f(\TT)$, which implies $g'(\TT,\pi) \leq 2 \f(\TT)$. \quad \qedblob
\end{proof}
\bigskip\noindent
The proof above immediately shows that for any $\TT$ and any
ordering $\pi_{\alpha}$ of $\TT$ with at most
$(1+\alpha)\cdot\opt(\TT)$ back edges,
$g'(\TT,\pi_{\alpha})\leq (2+\alpha)\f(\TT) + \alpha(k_w + k_l)$,
where $\opt(\TT)$ is the minimum number of back edges in $\TT$,
and $k_w$ and $k_l$ are as in the proof above.
Thus, a FAS approximation does not generally guarantee
that if $f$ is 0, then so is $g'$
(a guarantee provided by \textsc{Degree} and FAS).
For example, $\TT$ may make correct statements about all winner--loser
pairs while inducing a non-transitive tournament on the winners or the losers,
so an approximation that does not know the winner--loser labeling
can incur a non-zero number of mistakes.
\section{Lower Bounds}\label{lower}
We first show that Theorem~\ref{thm1} is best possible:
the \textsc{Degree} ranking really can make twice as many
mistakes as the adversary.
% We start with a lower bound for the algorithm that orders by the number
% of wins.
Recall that $f$ denotes the number of winner--loser
pairs where the loser beats the winner, and $g$ the number of winner--loser
pairs where the loser outranks the winner.
The example below generates % shows that there exists
an infinite family
of tournaments with $g = 2f$.
\noindent
{\bf Example 1.}\quad
With $n$ odd,
let every vertex have degree~$(n-1)/2$;
note that the degree sequence $\seq{\frac{n-1}2, \ldots, \frac{n-1}2}$
does indeed respect Landau's condition, so it is realizable as a tournament.
Label $(n-1)/2$ of the vertices as winners and $(n+1)/2$ as losers.
With ties broken against us, all winners are ordered after all losers.
This gives
$f = \frac{n+1}2 \cdot \frac{n-1}2 - \binom{(n+1)/2}2 = (n+1)(n-1)/8$,
while $g = \frac{n+1}2 \cdot \frac{n-1}2 = (n+1)(n-1)/4 = 2f$.
(A similar example gives a lower bound of $2-O(1/n)$
with ties broken optimally for the algorithm.)
\quad\qedblob
In a subsequent paper, Ailon and Mohri~\cite{ailon-mehryar} give a
simple algorithm-independent example showing that no deterministic algorithm
can achieve a constant factor of less than 2 on the regret ratio. The
example puts all the probability mass on a single three element
subset. The induced tournament is a directed 3-cycle, and the bipartition is
chosen adversarially depending on the ordering output by the algorithm.
By symmetry, there are only two different orderings (clockwise and
counterclockwise). In both cases, the adversary can make the
algorithm pay for both mixed pairs while paying for only one
misordering. This example implies that Theorem~\ref{thm4} is also
best possible.\footnote{ The preliminary version of this paper
contained a construction of a family of tournaments, for
which an optimal solution to the feedback arc set problem gives $g \geq
(2-\epsilon)f$, for any $\epsilon>0$. We decided to omit the
construction since it is superseded by the above lower bound example in
\cite{ailon-mehryar}. }
\shrink{
Theorem~\ref{thm4} is also essentially best possible.
The next construction gives an infinite family of tournaments
for which an optimal solution to the feedback arc set problem has
$g \geq (2-\epsilon)f$, for any $\epsilon>0$.
% That is, even if we could solve the NP-hard feedback arc set problem,
% it would not give any advantage.
\noindent
{\bf Example 2.}\quad
Set $\delta = \frac{\epsilon}{1-\epsilon}$, and let
the set of vertices be partitioned into three components,
$V_1$, $V_2$, and $V_3$, with
$|V_1| = \delta n^2$, $|V_2| = 2n^2$, and $|V_3|=n$, for a sufficiently large
$n$. The vertices in $V_1 \cup V_2$ are the winners,
the vertices in $V_3$ are the losers.
The edges within each of the three components form acyclic tournaments.
The cross-component edges are defined as follows:
All edges between $V_3$ and $V_1$ point from $V_3$ to $V_1$, and
all edges between $V_1$ and $V_2$ point from $V_1$ to $V_2$.
To define the edges between $V_2$ and $V_3$,
divide $V_2$ into $2n$ consecutive blocks $B_1,\ldots,B_{2n}$ of $n$
vertices each, such that edges point from $B_i$ to $B_j$ where
$1\leq i<j\leq 2n$, and from $B_j$ to $B_i$ otherwise.
% NOTE(review): a garbled gap follows (the end of this example and the
% opening of the multi-label section were lost in extraction); the line
% below is reconstructed from the surviving fragment -- verify against
% the published version.
Here $(y)_{+}$ denotes $y \cdot \I(y > 0)$.
As in the bipartite case, let $\Q$ denote the distribution of label
sequences of the set $\{1,\ldots,n\}$.
Then the generalized loss of ordering $i$ before $j$ is
$$
l_{\Q}(i,j) = \mathbf{E}_{y^n\sim \Q} \frac{(y_i - y_j)_{+}}{\sum_{u \neq v} (y_u - y_v)_{+}} .
$$
% NOTE(review): garbled gap (intervening definitions, the theorem
% statement, and the start of its proof were lost in extraction);
% \begin{proof} is re-opened here to match the \end{proof} below --
% verify against the published version.
\begin{proof}
As in the bipartite case, for any $j>i$,
$r_{\Q}(j,i)$ is proper, which in turn implies that
$r_{\Q}(i,j)=0$, concluding the proof.
\quad
\qedblob
\end{proof}
}
\end{document}
*