\documentclass[times, 10pt, twocolumn]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{latex8}
\usepackage{epsf}
\usepackage{pslatex}
\addtolength{\textfloatsep}{-0.5cm}
\addtolength{\parskip}{-0.03cm}
\renewcommand{\baselinestretch}{0.95}
%\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
\theoremstyle{plain}
\newtheorem{thm}{Theorem}[section]
\newtheorem{lem}{Lemma}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{defn}{Definition}[section]
%\numberwithin{equation}{section} %% Comment out for sequentially-numbered
%\numberwithin{figure}{section} %% Comment out for sequentially-numbered
%\makeatother
\pagestyle{empty} %% remove page numbering
\begin{document}
\title{Cost-Sensitive Learning by Cost-Proportionate Example Weighting}
\author{Bianca Zadrozny, John Langford\thanks{This author's present address:
Toyota Technological Institute at Chicago, 427 East 60th Street, Second Floor - Press Building, Chicago, IL 60637.
}, Naoki Abe\\
Mathematical\ Sciences\ Department\\
IBM T.~J.~Watson Research Center \\
Yorktown Heights, NY 10598 \\
}
\maketitle
\begin{abstract}
We propose and evaluate a family of methods for converting
classifier learning algorithms and classification theory into
cost-sensitive algorithms and theory. The proposed conversion
is based on cost-proportionate weighting of the training examples,
which can be realized either by feeding the weights to the
classification algorithm (as often done in boosting), or
by careful subsampling. We give some theoretical performance
guarantees on the proposed
methods, as well as empirical evidence that they are
practical alternatives to existing approaches.
In particular, we propose {\em costing}, a method based on
cost-proportionate rejection sampling and ensemble aggregation, which
achieves excellent predictive performance on two publicly available
datasets, while drastically reducing the computation required by
other methods.
\end{abstract}
\thispagestyle{empty}
\vspace{-0.2cm}
\section{Introduction}
\vspace{-0.2cm}
%Classification learning problems naturally come with costs. For
%example, there is a cost associated with failing to predict rain and a
%cost associated with predicting rain when it is actually sunny. The
%optimal classification of ``umbrella day'' or not is inherently
%dependent on these costs.
Highly non-uniform misclassification costs are very common in
a variety of challenging real-world data mining problems, such as fraud
detection, medical diagnosis and various problems in business
decision-making. In many cases, one class is rare but the cost of not
recognizing some of the examples belonging to this class is high. In
these domains, classifier learning methods that do not take
misclassification costs into account do not perform well. In extreme
cases, ignoring costs may produce a model that is useless because it
classifies every example as belonging to the most frequent class even
though misclassifications of the least frequent class result in a very
large cost.
Recently a body of work has attempted to address this issue, with
techniques known as cost-sensitive learning in the machine
learning and data mining communities. Current cost-sensitive learning
research falls into three categories. The first
is concerned with making particular classifier learners
cost-sensitive \cite{Holte,Adacost}. The second uses Bayes risk
theory to assign each example to its lowest risk class
\cite{Domingos,ZaEl01,Margineantu}. This requires estimating class
membership probabilities and, in the case where costs are
non-deterministic, also requires estimating expected costs
\cite{ZaEl01}. The third category concerns methods for converting
arbitrary classification learning algorithms into cost-sensitive ones
\cite{Domingos}. The work described here belongs to the last category.
In particular, the approach here is akin to the pioneering work of
Domingos on MetaCost \cite{Domingos}, which also is a general method
for converting cost-sensitive learning problems to cost-insensitive
learning problems. However, the method here is distinguished by the
following properties: (1) it is even simpler; (2) it has some
theoretical performance guarantees; and (3) it does not involve any
probability density estimation in its process: MetaCost estimates
conditional probability distributions via bagging with a classifier in
its procedure, and as such it also belongs to the second category (Bayes
risk minimization) mentioned above.
The family of proposed methods is motivated by a folk theorem that is
formalized and proved in Section~\ref{sec-folk}. This theorem states
that altering the original example distribution \( D \) to another \(
\hat{D} \), by multiplying it by a factor proportional to the
relative cost of each example, makes any
error-minimizing classifier learner accomplish
expected cost minimization on the original distribution. Representing
samples drawn from \( \hat{D} \), however, is more challenging than it
may seem. There are two basic methods for doing this: (i) Transparent Box: Supply the
costs of the training data as example weights to the
classifier learning algorithm. (ii) Black Box: resample
according to these same weights.
While the transparent box approach cannot be applied to arbitrary
classifier learners, it can be applied to many, including any
classifier which only uses the data to calculate expectations. We
show empirically that this method gives good results. The
black box approach has the advantage that it can be applied to any
classifier learner. It turns out, however, that straightforward
sampling-with-replacement can result in severe overfitting related
to duplicate examples.
We propose, instead, to employ {\em cost-proportionate rejection
sampling} to realize the latter approach, which allows us to
independently draw examples according to \( \hat{D} \).
%(In essence,
%this method accepts each example in the input sample with probability
%proportional to its associated weight).
This method comes with a
theoretical guarantee: In the worst case it produces
a classifier that achieves at least as good approximate cost
minimization as applying the base classifier learning algorithm on the
entire sample. This is a remarkable property for a subsampling
scheme: in general, we expect any technique using only a subset of the
examples to compromise predictive performance.
The runtime savings made possible by this sampling technique enable us
to run the classification algorithm on multiple draws of subsamples
and average over the resulting classifiers. This last method is
what we call {\em costing} (cost-proportionate rejection sampling with
aggregation). Costing allows us to use an arbitrary cost-insensitive
learning algorithm as a black box in order to accomplish
cost-sensitive learning, achieves excellent predictive performance and
can achieve drastic savings of computational resources.
%, both in terms of time and space.
\section{Motivating Theory and Methods}
\vspace{-0.2cm}
\subsection{A Folk Theorem}
\vspace{-0.2cm}
\label{sec-folk}
%Standard analysis in learning theory has not incorporated a varying cost per
%example, so the model here is necessarily slightly nonstandard. In particular,
We assume that examples are drawn independently from a distribution \( D \)
with domain \( X\times Y\times C \) where \( X
\) is the input space to a classifier, \( Y \) is a (binary) output
space and \( C \subset [0,\, \, \infty ) \) is the importance (extra cost) associated with
mislabeling that example. The goal is to learn a classifier \( h:X\rightarrow Y \) which
minimizes the expected cost,
$$E_{x,y,c\sim D}[c \, I(h(x)\neq y)]$$ given training data of
the form: \( (x,y,c) \), where $I(\cdot )$ is the indicator function
that has value 1 in case its argument is true and 0 otherwise.
This model does not explicitly allow using cost information at prediction time
although $X$ might include a cost feature if that is available.
%As a side note,
This formulation of cost-sensitive learning in terms
of one number per example is more general than ``cost matrix''
formulations which are more typical in cost-sensitive learning
\cite{Elkan,Domingos}, when the output space is binary.\footnote{How
to formulate the problem in this way when the output space is not
binary is nontrivial and is beyond the scope of this paper.} In the
cost matrix formulation, costs are associated with false negative,
false positive, true negative, and true positive predictions. Given
the cost matrix and an example, only two entries (false positive, true
negative) or (false negative, true positive) are relevant for that
example. These two numbers can be further reduced to one: (false
positive - true negative) or (false negative - true positive), because
it is the difference in cost between classifying an example correctly
or incorrectly which controls the importance of correct
classification. This difference is the importance $c$ we use here.
This setting is more general in the sense that the importance may vary on an
example-by-example basis.
A basic folk theorem\footnote{We say ``folk theorem'' here because
the result appears to be known by some and
it is straightforward to derive it from results in decision theory, although we have
not found it published.}
states that if we have examples drawn from the distribution:
\[
\hat{D}(x,y,c)\equiv \frac{c}{E_{x,y,c\sim D}[c]}D(x,y,c)\]
then optimal error rate classifiers for \( \hat{D} \) are optimal cost
minimizers for data drawn from \( D \).
\begin{thm}
\label{th:translate}(Translation Theorem)
For all distributions, $D$, there exists a constant $N = E_{x,y,c\sim D}[c]$ such that for all classifiers, $h$:
\[
E_{x,y,c\sim \hat{D}}[I(h(x)\neq y)] = \frac{1}{N} \, E_{x,y,c\sim D}[c \, I(h(x)\neq y)]
\]
\end{thm}
\begin{proof}\begin{eqnarray}
E_{x,y,c\sim D}[c \, I(h(x)\neq y)]
& = & \sum _{x,y,c}D(x,y,c)c \, I(h(x)\neq y) \nonumber\\
& = & N\sum _{x,y,c}\hat{D}(x,y,c)I(h(x)\neq y) \nonumber\\
& = & N E_{x,y,c\sim \hat{D}}[I(h(x)\neq y)]\nonumber\\
\mbox{where} & & \hat{D}(x,y,c)=\frac{c}{N}D(x,y,c).\nonumber
\end{eqnarray} \vspace{-0.3cm}
\end{proof}
Despite its simplicity, this theorem is useful to
us because the right-hand side expresses the expectation we want to
control (via the choice of $h$) and the left-hand side is the
probability that $h$ errs under another distribution. Choosing $h$
to minimize the rate of errors under $\hat{D}$ is equivalent to
choosing $h$ to minimize the expected cost under $D$. Similarly,
$\epsilon$-approximate error minimization under $\hat{D}$ is
equivalent to $N\epsilon$-approximate cost minimization under
$D$.
The prescription for coping with cost-sensitive problems is
straightforward: re-weight the distribution in your training set
according to the importances so that the training set is
effectively drawn from \( \hat{D} \). Doing this in a correct and
general manner is more challenging than it may seem and is the topic
of the rest of the paper.
\subsection{Transparent Box: Using Weights Directly}
\vspace{-0.2cm}
\label{sec-weights}
\subsubsection{General conversion}
Here we examine how importance weights can be used
%The approach taken here is a transparent box approach where
%access to the source code is required, and \emph{not} a black box approach
%(which we develop in the next section). In particular,
%we use the weights
within different learning algorithms to accomplish cost-sensitive
classification. We call this the transparent box approach because it requires knowledge
of the particular learning algorithm (as opposed to the black box approach that we
develop later).
The mechanisms for realizing the transparent box approach have been
described elsewhere for a number of weak learners used in boosting,
but we will describe them here for completeness.
%However, the ability to take a weight vector as input is \emph{not}
%sufficient.
%Naoki: I think that the weak learners in boosting are supposed to
%minimize errors with respect to the distribution specified by the weights.
The classifier learning algorithm must use the weights so that it
effectively learns from data drawn according to \( \hat{D} \). This
requirement is easy to apply for all learning algorithms
which fit the statistical query model \cite{SQ}.
\begin{figure}
\begin{center}
\epsfxsize=6cm
\leavevmode\epsfbox{sq_model.eps}
\vspace{-0.2cm}
\caption{\label{fig:sq}The statistical query model.
%Many learning algorithms have a data-independent portion which forms functions
%(queries) for which an expected value is calculated on the training data in
%a ``query oracle''.
%For any learning algorithm decomposable in this form,
%there is a generic method for learning from a re-weighted distribution directly.
}
\end{center}
\end{figure}
As shown in Figure~\ref{fig:sq}, many learning algorithms can be
divided into two components: a portion which calculates the
(approximate) expected value of some function (or query) $f$ and
a portion which forms these queries and uses their output to construct
a classifier. For example, neural networks,
%(with batch-mode gradient updates)
decision trees, and Naive Bayes classifiers can be constructed in this manner.
Support vector machines are not easily constructible in this way,
because the individual classifier is explicitly dependent upon
individual examples rather than on statistics derived from the entire sample.
With finite data we cannot precisely calculate the expectation \(
E_{x,y\sim D}[f(x,y)] \). With high probability, however, we can
approximate the expectation given a set of examples drawn
independently from the underlying distribution \( D \).
Whenever we have a learning algorithm that can be decomposed as in
Figure~\ref{fig:sq}, there is a simple recipe for using the weights
directly. Instead of simulating the expectation with \(
\frac{1}{|S|}\sum _{(x,y)\in S}f(x,y) \), we use \( \frac{1}{\sum
_{(x,y,c)\in S}c}\sum _{(x,y,c)\in S}cf(x,y) \). This method is
equivalent to importance sampling for \( \hat{D} \) using the
distribution \( D \), and so the modified expectation is an unbiased
Monte Carlo estimate of the expectation w.r.t. \( \hat{D} \).
Even when a learning algorithm does not fit this model, it may be
possible to incorporate importance weights
directly. We now discuss how to incorporate importance weights into
some specific learning algorithms.
\subsubsection{Naive Bayes and boosting}
Naive Bayes learns by calculating empirical probabilities for each output $y$ using Bayes' rule
and assuming that each feature is independent given the output:
$$P(y|x) = \frac{P(x|y)P(y)}{P(x)} = \frac{\prod_i P(x_i|y)P(y)}{\prod_i P(x_i)}$$
Each probability estimate in the above expression can be thought of as
a function of empirical expectations according to $D$, and thus it
can be formulated in the statistical query model.
For example, $P(x_i|y)$ is just the expectation of $I({\bf x_i}=x_i)\,I({\bf y} = y)$
divided by the expectation of $I({\bf y} = y)$.
More specifically, to compute the empirical estimate of $P(x_i|y)$ with respect to $D$,
we need to count the number of training examples that have $y$ as output, and those having $x_i$ as
the $i$-th input dimension among those. When we compute these empirical estimates with respect
to $\hat{D}$, we simply have to sum the weight of each example, instead of counting the examples.
(This property is used in the implementation of boosted Naive Bayes
\cite{Bnb}.)
To incorporate importance weights into AdaBoost \cite{Adaboost},
we give the importance weights to the weak learner in the first iteration, thus
effectively drawing examples from $\hat{D}$. In the subsequent
iterations, we use the standard AdaBoost rule to update the
weights. Therefore, the weights are adjusted according to the accuracy
on $\hat{D}$, which corresponds to the expected cost on $D$.
\iffalse
Note that AdaCost \cite{Adacost}, a variant of AdaBoost for
cost-sensitive learning, has also been proposed. AdaCost uses a
modified update rule to incorporate costs and improved
performance is observed. In contrast,
Proposition~\ref{th:translate} implies that such a modification is
not necessary if we start with examples drawn from $\hat{D}$. This
may seem contradictory, but note that Proposition~\ref{th:translate} is
purely about error translation and \emph{not} about learning algorithm
design. From the viewpoint of the proposition, AdaCost is a
learning algorithm with a different bias than the AdaBoost bias. Just
as other boosting algorithms such as LogitBoost \cite{Logitboost} are
sometimes superior to AdaBoost, AdaCost may be sometimes superior to
AdaBoost.
%Note that AdaCost
%performance on our problem is similar to reweighted Adaboost (although
%computationally, fewer iterations are required).
%Commented out by Bianca:it's confusing to talk about experiments here
%maybe we can add something later on, but I am not confident that my experiments
%with AdaCost are correct.
\fi
\subsubsection{C4.5}
C4.5 \cite{C4.5} is a widely used decision tree learner. There is a standard way of
incorporating example weights into it, which in the original algorithm was intended
to handle missing attributes (examples with missing attributes were divided into fractional examples,
each with a smaller weight, during the growth of the tree). This same facility was
later used by Quinlan in the implementation of boosted C4.5 \cite{Qu96}.
\subsubsection{Support Vector Machine}
The SVM algorithm \cite{Joachims}
learns the parameters $a$ and $b$ describing a linear decision rule
$h(x) = \mbox{sign}(a\cdot x + b)$,
so that the smallest distance between each training example and the decision boundary
(the margin) is maximized. It works by solving the following optimization problem:
$$\begin{array}{l}
\mbox{minimize:}~V(a,b,\xi)=\frac{1}{2}a\cdot a + C \sum_{i=1}^{n} \xi_i\\
\mbox{subject to:}~~ \forall i: ~~y_i[a \cdot x_i + b]\geq 1 - \xi_i, ~\xi_i > 0 \\
\end{array}
$$ The constraints require that all examples in the training set are
classified correctly up to some slack $\xi_i$. If a training example
lies on the wrong side of the decision boundary, the corresponding
$\xi_i$ is greater than 1. Therefore, $\sum_{i=1}^{n} \xi_i$ is an
upper bound on the number of training errors. The factor $C$ is a
parameter that allows one to trade off training error and model
complexity. The algorithm can be generalized to non-linear decision
rules by replacing inner products with a kernel function
in the formulas above.
The SVM algorithm does not fit the statistical query model.
Despite this, it is possible to incorporate
importance weights in a natural way.
First, we note that
$\sum_{i=1}^{n} c_i \xi_i$, where $c_i$ is the importance of example
$i$, is an upper bound on the total cost. Therefore, we can modify
$V(a,b,\xi)$ to
$$\begin{array}{l}
V(a,b,\xi)=\frac{1}{2}a\cdot a + C \sum_{i=1}^{n} c_i \xi_i.
\end{array}
$$
Now $C$ controls model complexity versus total cost.
The SVMLight package \cite{SVMLight} allows users to input weights
$c_i$ and works with the modified $V(a,b,\xi)$ as above, although this feature
has not yet been documented.
\subsection{Black Box: Sampling methods}
\label{sec-resample}
Suppose we do not have transparent box access to the learner.
In this case, sampling is the obvious method to convert from one
distribution of examples to another to obtain a
cost-sensitive learner using the translation theorem (Theorem~\ref{th:translate}). As it turns out,
straightforward sampling does not work well in this case,
motivating us to propose an alternative method based on rejection sampling.
\subsubsection{Sampling-with-replacement}
Sampling-with-replacement is a sampling scheme where each example \( (x,y,c) \) is drawn
according to the distribution \( p(x,y,c)=\frac{c}{\sum _{(x,y,c)\in S}c} \).
Many examples are drawn to create a new dataset \( S' \). This method,
at first pass, appears useful because every example is effectively drawn from
the distribution \( \hat{D} \).
In fact, very poor performance can result when using this technique, which is essentially
due to overfitting because of the fact that the examples in \( S' \) are not drawn
\emph{independently} from \( \hat{D} \), as we will elaborate in the section on
experimental results (Section~\ref{section:results}).
Sampling-without-replacement is also not a solution to this problem.
In sampling-without-replacement, an example \( (x,y,c) \) is drawn
from the distribution \( p(x,y,c)=\frac{c}{\sum _{(x,y,c)\in S}c} \)
and the next example is drawn from the set \( S-\{x,y,c\} \). This
process is repeated, drawing from a smaller and smaller set according
to the weights of the examples remaining in the set.
To see how this method fails, note that sampling-without-replacement
\( m \) times from a set of size \( m \) results in the
original set, which (by assumption) is drawn from the distribution \(
D \), and not \( \hat{D} \) as desired.
\subsubsection{Cost-proportionate rejection sampling}
There is another sampling scheme called rejection sampling \cite{VonNeumann}
which allows us to draw examples independently from the distribution
\( \hat{D} \), given examples drawn independently from \( D \).
In rejection sampling, examples from \( \hat{D}
\) are obtained by first drawing examples from \( D \), and then keeping (or accepting)
the sample with probability proportional to \( \hat{D}/D \).
Here, we have \( \hat{D}/D \propto c \), so we accept an example
with probability \( c/Z \), where \( Z \) is some
constant chosen so that \( \max _{(x,y,c)\in S}c\leq Z \),\footnote{In
practice, we choose $Z= \max _{(x,y,c)\in S}c$ so as
to maximize the size of the set $S'$. A data-dependent choice
of $Z$ is \emph{not} formally allowed for rejection sampling.
However, the introduced bias appears small when $|S|\gg 1$. A precise
measurement of ``small'' is an interesting theoretical problem. }
leading to the name {\em cost-proportionate rejection sampling}.
Rejection sampling results in a set \( S' \) which is
generally smaller than \( S \). Furthermore, because inclusion of an
example in \( S' \) is independent of other examples, and the examples in
\( S \) are drawn independently, we know that the examples in \( S' \)
are distributed independently according to \( \hat{D} \).
Using cost-proportionate rejection sampling to create a set $S'$ and
then using a learning algorithm $A(S')$ is guaranteed to produce an
approximately cost-minimizing classifier, as long as the learning
algorithm \( A \) achieves approximate minimization of classification error.
\begin{thm}
(Correctness) For all cost-sensitive sample sets $S$, if cost-proportionate
rejection sampling produces a sample set $S'$ and $A(S')$ achieves $\epsilon$
classification error:
\[ E_{x,y,c \sim \hat{D}} [I(h(x)\neq y)] \leq \epsilon
\]
then $h=A(S')$ approximately minimizes cost:
\vspace{-0.1cm}
\[
E_{x,y,c \sim D} [c \, I(h(x)\neq y)] \leq \epsilon N
\]
where $N= E_{x,y,c\sim D}[c]$.
\label{thmcorrect}
\end{thm}
\begin{proof}
Rejection sampling produces a sample set \( S' \) drawn independently
from \( \hat{D} \). By assumption \( A(S') \) outputs a classifier $h$ such that
\vspace{-0.3cm}
$$
E_{x,y,c \sim \hat{D}} [I(h(x)\neq y)] \leq \epsilon
$$
By the translation theorem (Theorem~\ref{th:translate}), we know that
\vspace{-0.3cm}
$$
E_{x,y,c \sim \hat{D}} [I(h(x)\neq y)] = \frac{1}{N} \, E_{x,y,c\sim D}[c \, I(h(x)\neq y)]
$$
Thus,
\vspace{-0.3cm}
%$$
%\frac{1}{N} \, E_{x,y,c\sim D}[c \, I(h(x)\neq y)] \leq \epsilon
%$$
%and
$$
E_{x,y,c\sim D}[c \, I(h(x)\neq y)] \leq \epsilon N.
$$
\vspace{-0.2cm}
\end{proof}
\vspace{-0.5cm}
\subsubsection{Sample complexity of cost-proportionate rejection sampling}
The accuracy of a learned classifier generally improves monotonically
with the number of examples in the training set. Since
cost-proportionate rejection sampling produces
a smaller training set (by a factor of about $N/Z$), one would
expect worse performance than using the entire training set.
This turns out to not be the case, in the {\em agnostic PAC-learning
model} \cite{Valiant,agnostic}, which formalizes the notion of
probably approximately optimal learning from arbitrary
distributions $D$.
\begin{defn}
A learning algorithm $A$ is said to be an {\em agnostic PAC-learner}
for hypothesis class $H$, with sample complexity $m(1/\epsilon,1/\delta)$
if for all $\epsilon > 0$ and $\delta > 0$,
$m = m(1/\epsilon,1/\delta)$ is the least sample size such that
for all distributions $D$ (over $X \times Y$), the classification
error rate of its output $h$ is at most $\epsilon$ more than the best
achievable by any member of $H$ with probability at least $1 -
\delta$, whenever the sample size exceeds $m$.
\end{defn}
By analogy, we can formalize the notion of {\em cost-sensitive}
agnostic PAC-learning.
\begin{defn}
A learning algorithm $A$ is said to be a
{\em cost-sensitive agnostic PAC-learner} for hypothesis class $H$,
with {\em cost-sensitive} sample complexity $m(1/\epsilon,1/\delta)$,
if for all $\epsilon > 0$ and $\delta > 0$,
$m = m(1/\epsilon,1/\delta)$ is the least sample size such that
for all distributions $D$ (over $X \times Y \times C$), the expected cost
of its output $h$ is at most $\epsilon$ more than the best
achievable by any member of $H$ with probability at least $1 -
\delta$, whenever the sample size exceeds $m$.
\end{defn}
We will now use this formalization to compare the cost-sensitive
PAC-learning sample complexity of two methods:
applying a given base classifier learning algorithm
to a sample obtained through cost-proportionate rejection sampling,
and applying the same algorithm on the original training set.
We show that the cost-sensitive sample complexity of the latter method
is lower-bounded by that of the former.
\begin{thm} (Sample Complexity Comparison)
Fix an arbitrary base classifier learning algorithm $A$, and
suppose that $m_\textrm{orig}(1/\epsilon, 1/\delta)$ and
$m_\textrm{rej}(1/\epsilon, 1/\delta)$, respectively,
are the cost-sensitive sample complexities of
applying $A$ on the original training set,
and that of applying $A$ with cost-proportionate rejection sampling.
Then, we have
\[
m_{\textrm{orig}}(1/\epsilon, 1/\delta)
= \Omega ( m_{\textrm{rej}}(1/\epsilon, 1/\delta)).
\]
\end{thm}
\begin{proof}
Let $m(1/\epsilon, 1/\delta)$ be the (cost-insensitive) sample complexity
of the base classifier learning algorithm $A$.
(If no such function exists, then neither
$m_{\textrm{orig}}(1/\epsilon, 1/\delta)$ nor
$m_{\textrm{rej}}(1/\epsilon, 1/\delta)$ exists,
and the theorem holds vacuously.)
Since $Z$ is an upper bound on the cost of misclassifying an example,
we have that the cost-sensitive sample complexity of
using the original training set satisfies
$$
m_{\textrm{orig}}(1/\epsilon, 1/\delta) = \Theta(m(Z/\epsilon, 1/\delta))
$$
This is because given a distribution
%(over $X \times Y$)
that forces $\epsilon$ more classification error than optimal,
another distribution
%(over $X \times Y \times C$)
can be constructed, that forces $\epsilon Z$ more cost than optimal,
by assigning cost $Z$ to all examples on which $A$ errs.
Now from Theorem~\ref{thmcorrect} and noting that the central limit
theorem implies that cost-proportionate rejection sampling reduces
the sample size by a factor of $\Theta(N/Z)$,
the cost-sensitive sample complexity for rejection
sampling is:
\begin{equation}
\label{eqngrej}
m_{\textrm{rej}}\left( 1/\epsilon, 1/\delta \right) =
\Theta\left( \frac{Z}{N} m\left(N/\epsilon,
1/\delta\right) \right).
\end{equation}
A fundamental theorem from PAC-learning theory states that
$m(1/\epsilon,1/\delta) = \Omega((1/\epsilon) \ln
(1/\delta))$ \cite{EHKV}. When $m(1/\epsilon,1/\delta) =
\Theta((1/\epsilon) \ln (1/\delta))$, Equation~(\ref{eqngrej})
implies:$$
%\begin{eqnarray*}
m_{\textrm{rej}}\left( 1/\epsilon, 1/\delta \right)
%& = & \Theta\left(\max \left\{\frac{Z}{N} \cdot f\left(\frac{\epsilon}{N}, \delta\right),
%\log 1/\delta\right\}\right)
%\:\:\mbox{Eq~(\ref{eqngrej})}\\
= \Theta\left(\frac{Z}{N} \frac{N}{\epsilon} \ln (1/\delta) \right)
%& = & \Theta\left(\frac{Z}{N} \cdot N \cdot f\left(\epsilon, \delta\right)\right)
%\:\:\mbox{by (super)linearity of $f$ in $1/\epsilon$} \\
%& = & \Theta(Z \cdot f(\epsilon, \delta)) \\
%& = & \Theta\left(\frac{Z}{\epsilon} \ln (1/\delta) \right) \\
%& = & \Theta\left(m_{\textrm{orig}}\left(1/\epsilon, 1/\delta\right)\right)
= \Theta\left(m_{\textrm{orig}}\left(1/\epsilon, 1/\delta\right)\right)
%\:\:\mbox{Equation~(\ref{eqngtri})}
%\label{eqnrt1}
%\end{eqnarray*}
$$
Finally, note that when $m(1/\epsilon, 1/ \delta)$ grows faster than
linear in $1/\epsilon$, we have $ m_{\textrm{rej}}(1/\epsilon,
1/\delta) = o(m_{\textrm{orig}}(1/\epsilon, 1/\delta))$, which
finishes the proof.
\end{proof}
\vspace{-0.1cm}
Note that the linear dependence of sample size on $1/\epsilon$ is only
achievable by an ideal learning algorithm, and in practice
super-linear dependence is expected, especially in the presence of
noise. Thus, the above theorem implies that {\em cost-proportionate
rejection sampling} minimizes cost better than no sampling for worst
case distributions.
This is a remarkable property of any sampling scheme, since one generally
expects that predictive performance is compromised by using a smaller sample.
%Sorry but I took out the following because it seems unnecessry in an
%informal discussion. - N. A.
%For worst-case distributions,
Cost-proportionate rejection sampling seems to
{\em distill} the original sample and obtains a sample of
smaller size, which is at least as informative as the original.
\subsubsection{Cost-proportionate rejection sampling with aggregation (costing)}
\label{sec-costing}
From the same original training sample, different runs of cost-proportionate
rejection sampling will produce different training samples.
Furthermore, the fact that rejection sampling produces very small samples
means that the time required for learning a classifier is generally much smaller.
We can take advantage of these properties to devise an ensemble learning algorithm based on
repeatedly performing rejection sampling from $S$ to
produce multiple sample sets $S_1',\ldots,S_m'$, and then learning a classifier for
each set. The output classifier is the average over all learned classifiers.
We call this technique {\em costing}:
\vspace{5pt}
\textbf{Costing(Learner \( A \), Sample Set \( S \), count \( t \))}
\begin{enumerate}
\item \textbf{For \( i=1 \) to \( t \) do}
\begin{enumerate}
\item \textbf{\( S'= \) rejection sample from \( S \) with acceptance probability $c/Z$.}
\item \textbf{Let \( h_{i}\equiv A(S') \)}
\end{enumerate}
\item \textbf{Output \( h(x)=\textrm{sign}\left( \sum _{i=1}^{t}h_{i}(x)\right) \)}
\end{enumerate}
The goal in averaging is to improve performance. There is both
empirical and theoretical evidence suggesting that averaging
can be useful. On the empirical side, many people have observed good
performance from bagging despite throwing away a $1/e$ fraction of the
samples. On the theoretical side, there has been considerable work proving
that the ability of an average of classifiers to overfit
might be smaller than naively expected when a large margin
exists. The preponderance of learning algorithms producing averaging
classifiers provides significant evidence that averaging is useful.
Note that despite the extra computational cost of averaging, the
overall computational time of costing is generally much smaller than that of a
learning algorithm using sample set $S$ (with or without weights). This
is the case because most learning algorithms have running times that
are superlinear in the number of examples.
\section{Empirical evaluation}
\label{section:results}
We show empirical results using two real-world datasets. We selected
datasets that are publicly available and for which
cost information is available on a per example basis. Both datasets
are from the direct marketing domain. Although there are many other
data mining domains that are cost-sensitive, such as credit card fraud detection
and medical diagnosis, publicly available data are
lacking.
\subsection{The datasets used}
\vspace{-0.2cm}
\subsubsection{KDD-98 dataset}
This is the well-known and challenging
dataset from the KDD-98 competition, now available at the UCI KDD
repository \cite{KDD}. The dataset contains information about persons
who have made donations in the past to a particular charity. The
decision-making task is to choose which donors to mail a request for a
new donation. The measure of success is the total profit obtained in
the mailing campaign.
The dataset is divided in a fixed way into a training set and a
test set. Each set consists of approximately 96000 records for which it is known
whether or not the person made a donation and how much
the person donated, if a donation was made. The overall percentage of donors is about 5\%.
Mailing a solicitation to an individual costs the charity $\$0.68$.
The donation amount for persons who respond varies from \$1 to \$200.
The profit obtained by soliciting every individual in the test set is \$10560,
while the profit attained by the winner of the KDD-98 competition was \$14712.
The importance of each example is the absolute difference in profit between mailing
and not mailing an individual. Mailing results in the donation amount
minus the cost of mailing. Not mailing results in zero profit.
Thus, for positive examples (respondents), the importance varies from
$\$0.32$ to $\$199.32$. For negative examples (non-respondents), it is fixed at $\$0.68$.
\subsubsection{DMEF-2 dataset}
This dataset can be obtained from the DMEF dataset library \cite{DMEF}
for a nominal fee. It contains customer buying
history for 96551 customers of a nationally known catalog. The decision-making
task is to choose which customers should receive a new catalog so as to maximize
the total profit on the catalog mailing campaign. Information on the cost of
mailing a catalog is not available, so we fixed it at \$2.
The overall percentage of respondents is about 2.5\%. The purchase amount for customers
who respond varies from \$3 to \$6247. As is the case for the KDD-98 dataset, the importance
of each example is the absolute difference in profit between mailing and not mailing a customer.
Therefore, for positive examples (respondents), the
importance varies from \$1 to \$6245. For negative examples (non-respondents), it is fixed at \$2.
We divided the dataset in half to create a training set and a test set. As a baseline
for comparison, the profit obtained by mailing a catalog to every individual on the training set is
\$26474 and on the test set is \$27584.
\subsection{Experimental results}
\vspace{-0.2cm}
\subsubsection{Transparent box results}
\begin{table}
\begin{center}
KDD-98:\\
\vspace{3pt}
\begin{tabular}{|l|c|c|}\hline
Method & Without Weights & With Weights\\ \hline
Naive Bayes & 0.24 & 12367 \\
Boosted NB & -1.36 & 14489 \\
C4.5 & 0 & 118 \\
SVMLight & 0 & 13683 \\ \hline
\end{tabular}
\vspace{5pt}
DMEF-2:\\
\vspace{3pt}
\begin{tabular}{|l|c|c|}\hline
Method & Without Weights & With Weights\\ \hline
Naive Bayes & 16462 & 32608 \\
Boosted NB & 121 & 36381 \\
C4.5 & 0 & 478 \\
SVMLight & 0 & 36443 \\ \hline
\end{tabular}
\vspace{-0.4cm}
\end{center}
\caption{Test set profits with transparent box.}
\label{table:resultsWeights}
\end{table}
Table \ref{table:resultsWeights} (top) shows the results for
Naive Bayes, boosted Naive Bayes (100 iterations), C4.5, and
SVMLight on the KDD-98 and DMEF-2 datasets, with and without the
importance weights. Without the importance weights,
the classifiers label very few of the examples positive, resulting in small (and even negative)
profits. With the costs given as weights to the learners, the results
improve significantly for all learners, except C4.5.
Cost-sensitive boosted Naive Bayes gives
results comparable to the best so far with this dataset \cite{ZaEl01}
using more complicated methods.
We optimized the parameters of the SVM by cross-validation on the
training set. Without weights, no setting of the parameters prevented
the algorithm from labeling all examples as negative.
With weights, the best parameters were a polynomial kernel with degree 3
and $C=5\times10^{-5}$ for KDD-98 and a linear kernel with $C=0.0005$ for DMEF-2.
However, even with this parameter setting, the results are not so impressive.
This may be a hard problem for margin-based classifiers because the
data is very noisy. Note also that running SVMLight on this dataset
takes about 3 orders of magnitude longer than AdaBoost with 100
iterations.
The failure of C4.5 to achieve good profits with importance weights is probably related
to the fact that the facility for incorporating weights provided in the algorithm is
heuristic. So far, it has been used only in situations where the weights are fairly uniform
(such as is the case for fractional instances due to missing data). These results indicate that
it might not be suitable for situations with highly non-uniform costs. The fact that it is
non-trivial to incorporate costs directly into existing learning algorithms is the motivation for the
black box approaches that we present here.
\begin{table*}
\begin{center}
{\small KDD-98:
\vspace{2pt}
\begin{tabular}{|c||c|c||c|c||c|c|} \hline
& \multicolumn{2}{|c||}{1000}&\multicolumn{2}{|c||}{10000} &\multicolumn{2}{|c|}{100000}\\ \hline
& Training & Test & Training & Test & Training & Test\\ \hline
NB & 11251 (330) & 10850 (325) & 12811 (155) & 11993 (185) & 12531 (242) & 12026 (256)\\
BNB & 11658 (311) & 11276 (383) & 13838 (65) & 12886 (212) & 14107 (152) & 13135 (159)\\
C4.5 & 11124 (255) & 9548 (331) & 22083 (271) & 7599 (310) & 40704 (152) & 2259 (107)\\
SVM & 10320 (372) & 10131 (281) & 11228 (182) & 11015 (161) & 13565 (129) & 12808 (220)\\ \hline
\end{tabular}
\vspace{4pt}
DMEF-2:
\vspace{2pt}
\begin{tabular}{|c||c|c||c|c||c|c|} \hline
& \multicolumn{2}{|c||}{1000}&\multicolumn{2}{|c||}{10000} &\multicolumn{2}{|c|}{100000}\\ \hline
& Training & Test & Training & Test & Training & Test\\ \hline
NB & 33298 (495) & 34264 (419) & 32742 (793) & 33956 (798) & 33511 (475) & 34506 (405)\\
BNB & 33902 (558) & 30304 (660) & 34802 (806) & 31342 (772) & 34505 (822) & 31889 (733)\\
C4.5 & 37905 (1467) & 24011 (1931) & 67960 (763) & 9188 (458) & 72574 (1205) & 3149 (519)\\
SVM & 28837 (1029) & 30177 (1196) & 31263 (1121) & 32585 (891) & 34309 (719) & 33674 (600)\\
\hline
\end{tabular}}
\vspace{-0.4cm}
\end{center}
\caption{Profits using sampling-with-replacement.}
\label{table:resampling}
\end{table*}
\subsubsection{Black box results}
Table~\ref{table:resampling} shows the results of applying the same learning
algorithms to the KDD-98 and DMEF-2 data using training sets of
different sizes obtained by sampling-with-replacement.
For each size, we repeat the experiments 10 times with different sampled
sets to get mean and standard error (in parentheses). The training set profits are on
the original training set from which we draw the sampled sets.
The results confirm that application of sampling-with-replacement to implement the
black box approach can result in very poor performance due to overfitting.
When there are large differences in the magnitude of importance weights, it is typical
for an example to be picked twice (or more). In Table~\ref{table:resampling}, we see
that as we increase the sampled training set size and, as a consequence, the number
of duplicate examples in the training set, the training profit becomes larger
while the test profit becomes smaller for C4.5.
Examples which appear multiple times in the training set of a learning
algorithm can defeat complexity control mechanisms built into learning
algorithms.
%\footnote{ This observation has implications for the
%practice of bagging, although the loss of complexity control is not as
%severe as observed here. Uniform sampling-with-replacement from a set of size \( m
%\), \( m \) times produces only about \( \frac{m}{e} \)
%duplicates. Since (with high probability) not all examples are
%duplicates, complexity control in bagging is only weakened and not
%removed. We note that Fan et al \cite{Framework} have proposed a
%modification to bagging, in which partitioning is used in place of
%sampling to address this issue.}.
For example, suppose that we have
a decision tree algorithm which divides the training data into a
``growing set'' (used to construct a tree) and a ``pruning set'' (used
to prune the tree for complexity control purposes). If the pruning set
contains examples which appear in the growing set, the complexity
control mechanism is defeated.
Although not as markedly as for C4.5, we see the same phenomenon for the other
learning algorithms. In general, the larger the resampled training set, the larger the
difference between training set profit and test set profit. And, even with
100000 examples, we do not obtain the same test set results as giving the
weights directly to Boosted Naive Bayes and SVM.
The fundamental difficulty here is that the samples in \( S' \) are not drawn
\emph{independently} from \( \hat{D} \). In particular, if \( \hat{D} \) is
a density, the probability of observing the same example twice given independent
draws is \( 0 \), while the probability using sampling-with-replacement
is greater than \( 0 \).
Thus sampling-with-replacement fails because the sampled set \( S' \) is
not constructed independently.
Figure~\ref{fig:costing} shows the results of costing on
the KDD-98 and DMEF-2 datasets, with the
base learners and $Z=200$ or $Z=6247$, respectively. We repeated the experiment
10 times for each $t$ and calculated the mean and standard error of the profit. The results
for $t=1$, $t=100$ and $t=200$ are also given in Table~\ref{table:costing}.
In the KDD-98 case, each resampled set has only about 600 examples, because
the importance of the examples varies from 0.68 to 199.32 and there
are few ``important'' examples. About 55\% of the examples in each set are
positive, even though on the original dataset the percentage of positives is only 5\%.
With $t=200$, the C4.5 version yields profits around
\$15000, which is exceptional performance for this dataset.
In the DMEF-2 case, each set has only about 35 examples, because the importances vary even more
widely (from 2 to 6246) and there are even fewer examples with a large importance
than in the KDD-98 case. The percentage of positive examples in each set is about 50\%, even
though on the original dataset it was only 2.5\%.
For learning the SVMs, we used the same kernels as we did in section \ref{sec-weights} and
the default setting for $C$. In that section, we saw that
by feeding the weights directly to the SVM, we obtain a profit of
\$13683 on the KDD-98 dataset and of \$36443 on the DMEF-2 dataset. Here,
we obtain profits around \$13100 and \$35000, respectively. However,
this did not require parameter optimization and, even with $t=200$,
was much faster to train. The reason for the speed-up is that the time
complexity of SVM learning is generally superlinear in the number of
training examples.
\begin{table}
\begin{center}
{\small KDD-98:
\vspace{2pt}
\begin{tabular}{|c|c|c|c|} \hline
& 1 & 100 & 200\\ \hline
NB & 11667 (192) & 13111 (102) & 13163 (68) \\
BNB & 11377 (263) & 14829 (92) & 14714 (62) \\
C4.5 & 9628 (511) & 14935 (102) & 15016 (61) \\
SVM & 10041 (393) & 13075 (41) & 13152 (56) \\ \hline
\end{tabular}
\vspace{4pt}
DMEF-2:
\vspace{2pt}
\begin{tabular}{|c|c|c|c|} \hline
& 1 & 100 & 200\\ \hline
NB & 26287 (3444) & 37627 (335) & 37629 (139) \\
BNB & 24402 (2839) & 37376 (393) & 37891 (364) \\
C4.5 & 27089 (3425) & 36992 (374) & 37500 (307) \\
SVM & 21712 (3487) & 33584 (1215) & 35290 (849) \\
\hline
\end{tabular}}
\vspace{-0.4cm}
\end{center}
\caption{Test set profits using costing.}
\label{table:costing}
\end{table}
\begin{figure*}
\begin{center}
KDD-98:
\vspace{2pt}
\epsfxsize=4.2cm
\leavevmode\epsfbox{nb_kdd.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{bnb_kdd.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{c45_kdd.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{svm_kdd.eps}
DMEF-2:
\vspace{2pt}
\epsfxsize=4.2cm
\leavevmode\epsfbox{nb_dmef.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{bnb_dmef.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{c45_dmef.eps}
\epsfxsize=4.2cm
\leavevmode\epsfbox{svm_dmef.eps}
\vspace{-0.2cm}
\end{center}
\caption{Costing: test set profit vs. number of sampled sets.}
\label{fig:costing}
\end{figure*}
\section{Discussion}
Costing is a technique which produces a cost-sensitive classification from a
cost-insensitive classifier using only black box access. This simple method
is fast, results in excellent performance and
often achieves drastic savings in computational resources, particularly
with respect to space requirements.
This last property is especially desirable in applications of
cost-sensitive learning to domains that involve massive amounts of data,
such as fraud detection, targeted marketing, and intrusion detection.
Another desirable property of any reduction is that it applies to the
theory as well as to concrete algorithms.
Thus, the reduction presented in the present paper allows us to
automatically apply any future results in cost-insensitive classification
to cost-sensitive classification.
For example, a bound on the future error rate of \( A(S') \)
implies a bound on the expected cost with respect to the distribution \( D \).
This additional property of a reduction is especially important because
cost-sensitive learning theory is still young and relatively unexplored.
One direction for future work is multiclass cost-sensitive learning.
If there are $K$ classes, the minimal representation of costs is $K-1$
weights. A reduction to cost-insensitive classification using these
weights is an open problem.
\small
\begin{thebibliography}{1}
\bibitem{DMEF} Anifantis, S. The DMEF Data Set Library.
The Direct Marketing Association, New York, NY, 2002.
\texttt{[http://www.the-dma.org/dmef/dmefdset.shtml]}
\bibitem{Domingos} Domingos, P. ~{MetaCost}: A general method
for making classifiers cost sensitive. {\em Proceedings of the 5th International Conference on
Knowledge Discovery and Data Mining}, 155-164, 1999.
\bibitem{Holte} Drummond, C. \& Holte, R. Exploiting the cost
(in)sensitivity of decision tree splitting criteria. {\em Proceedings of the 17th
International Conference on Machine Learning}, 239-246, 2000.
\bibitem{EHKV} Ehrenfeucht, A., Haussler, D., Kearns, M. \& Valiant, L. A general lower bound
on the number of examples needed for learning. {\em Information and Computation}, {\em 82:3},
247-261, 1989.
\bibitem{Bnb} Elkan, C. {\em Boosting and naive bayesian learning} (Technical Report).
University of California, San Diego, 1997.
\bibitem{Elkan} Elkan, C. The foundations of cost-sensitive learning. {\em Proceedings of the
17th International Joint Conference on Artificial Intelligence}, 973-978, 2001.
\bibitem{Adacost} Fan, W., Stolfo, S., Zhang, J. \& Chan, P.
AdaCost: Misclassification cost-sensitive boosting. {\em Proceedings of the 16th
International Conference on Machine Learning}, 97-105, 1999.
\bibitem{Adaboost} Freund, Y. \& Schapire, R. E. A decision-theoretic generalization of on-line learning and an
application to boosting. {\em Journal of Computer and System Sciences}, {\em 55:1}, 119-139, 1997.
\bibitem{KDD} Hettich, S. \& Bay, S. D.
The UCI KDD Archive. University of California, Irvine. \texttt{[http://kdd.ics.uci.edu/]}.
\bibitem{SVMLight} Joachims, T. Making large-scale SVM learning practical.
In {\em Advances in Kernel Methods - Support Vector Learning}. MIT Press, 1999.
\bibitem{Joachims} Joachims, T.
Estimating the generalization performance of a SVM efficiently. {\em Proceedings of the 17th International
Conference on Machine Learning}, 431-438, 2000.
\bibitem{agnostic} Kearns, M., Schapire, R., \& Sellie, L.
Toward Efficient Agnostic Learning. {\em Machine Learning}, {\em 17}, 115-141, 1998.
\bibitem{SQ} Kearns, M. Efficient noise-tolerant learning from statistical queries.
{\em Journal of the ACM}, {\em 45:6}, 983-1006, 1998.
\bibitem{Margineantu} Margineantu, D. Class probability estimation and cost-sensitive classification decisions.
{\em Proceedings of the 13th European Conference on Machine Learning}, 270-281, 2002.
\bibitem{Qu96} Quinlan, J. R. Boosting, Bagging, and C4.5. {\em Proceedings of the Thirteenth National Conference
on Artificial Intelligence}, 725-730, 1996.
\bibitem{C4.5} Quinlan, J. R. {\em C4.5: Programs for Machine Learning}. San Mateo, CA: Morgan Kaufmann, 1993.
\bibitem{Valiant} Valiant, L. A theory of the learnable.
{\em Communications of the ACM}, {\em 27:11}, 1134-1142, 1984.
\bibitem{VonNeumann} von Neumann, J. Various techniques used in connection with random digits,
{\em National Bureau of Standards}, {\em Applied Mathematics Series}, {\em 12}, 36-38, 1951.
\bibitem{ZaEl01} Zadrozny, B. and Elkan, C. Learning and making decisions when costs
and probabilities are both unknown. {\em Proceedings of the 7th International Conference on
Knowledge Discovery and Data Mining}, 203-213, 2001.
\end{thebibliography}
\end{document}