From 976fa1a1ee96a8f4c97f9d1370d14473c08e33d4 Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Mon, 17 Mar 2008 16:24:42 +0100
Subject: [PATCH] Karger's sampling lemma.

---
 PLAN       |  1 +
 adv.tex    | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 biblio.bib | 22 ++++++++++++++++-
 mst.tex    |  4 ++--
 4 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/PLAN b/PLAN
index 2a6ae25..5554906 100644
--- a/PLAN
+++ b/PLAN
@@ -59,6 +59,7 @@ Spanning trees:
 - bounded expansion classes?
 - restricted cases and arborescences
 - mention parallel algorithms (see remarks in Karger)
+- Pettie's paper on random bits
 
 Models:
 
diff --git a/adv.tex b/adv.tex
index 65d2029..b932458 100644
--- a/adv.tex
+++ b/adv.tex
@@ -1052,6 +1052,75 @@ is still open even for the RAM.
 
 \section{A~randomized algorithm}\id{randmst}%
 
+When we analysed the contractive Bor\o{u}vka's algorithm in Section~\ref{contalg},
+we observed that while the number of vertices per phase decreases exponentially,
+the number of edges can remain asymptotically the same. Karger, Klein and Tarjan
+\cite{karger:randomized} have overcome this problem by combining the Bor\o{u}vka's
+algorithm with filtering based on random sampling. This leads to a~randomized
+algorithm which runs in expected linear time.
+
+The principle of the filtering is simple: Let us consider any spanning tree~$T$
+of the input graph~$G$. Each edge of~$G$ that is $T$-heavy is the heaviest edge
+of some cycle, so by the Red lemma (\ref{redlemma}) it cannot participate in
+the MST of~$G$. We can therefore discard all $T$-heavy edges and continue with
+finding the MST on the reduced graph. Not all choices of~$T$ are equally good,
+but the following lemma shows that when we take~$T$ as a~MST of a~randomly selected
+subgraph, only a~small expected number of edges remain.
+
+\rem
+The random sampling will sometimes select a~disconnected subgraph,
+so we will not assume connectedness in this section. As we already noted
+(Remark \ref{disconn}), with a~little bit of care our algorithms and theorems
+keep working. We also extend the notion of light and heavy edges with respect
+to a~tree to forests: When an~edge~$e$ connects two vertices lying in the same
+tree~$T$ of a~forest~$F$, it is $F$-heavy iff it is $T$-heavy (similarly
+for $F$-light). Edges connecting two different trees are always considered
+$F$-light. Again, a~spanning forest~$F$ is minimum iff there are no $F$-light
+edges.
+
+\lemman{Random sampling, Karger \cite{karger:sampling}}
+Let $H$~be a~subgraph of~$G$ obtained by including each edge independently
+with probability~$p$ and $F$~the minimum spanning forest of~$H$. Then the
+expected number of $F$-nonheavy edges in~$G$ is at most $n/p$.
+
+\proof
+Let us observe that we can obtain the forest~$F$ by combining the Kruskal's algorithm
+(\ref{kruskal}) with the random process producing~$H$ from~$G$. We sort all edges of~$G$
+by their weights and start with an~empty forest~$F$. For each edge, we first
+flip a~biased coin (it gives heads with probability~$p$) and if it comes up
+tails, we discard the edge. Otherwise we perform a~single step of the Kruskal's
+algoritm: We check if~$F+e$ contains a~cycle. If it does, we discard~$e$, otherwise
+we add~$e$ to~$F$. At the end, we have produced the subgraph~$H$ and its MSF~$F$.
+
+We can modify the algorithm by swapping the check for cycles with flipping of
+the coin:
+\algo
+\:If $F+e$ contains a~cycle, we immediately discard~$e$ (we can flip
+the coin, but we need not to, because the edge will be discarded regardless of
+the outcome). We note that~$e$ is $F$-heavy with respect to both the
+current~$F$ and the final MSF.
+\:If $F+e$ is acyclic, we flip the coin:
+\::If it comes up heads, we add~$e$ to~$F$. In this case, $e$~is neither $F$-light
+   nor $F$-heavy.
+\::If it comes up tails, we discard~$e$. Such edges are $F$-light.
+\endalgo
+
+The number of $F$-nonheavy edges is therefore equal to the number of the coin
+flips in step~2 of this algorithm. We also know that the algorithm stops before
+it adds $n$~edges to~$F$. Therefore it flips at most as many coins as a~simple
+random process which repeatedly flips until it gets~$n$ heads. As waiting for
+every occurence of heads takes expected time~$1/p$, waiting for~$n$ heads
+must take $n/p$. This is the bound we wanted to achieve.
+\qed
+
+\rem
+We could also use a~slightly different formulation of the sampling lemma
+suggested by Chan \cite{chan:backward}. It changes the selection of the subgraph~$H$
+to choosing an~$mp$-edge subset of~$E(G)$ uniformly at random. The proof is then
+a~straightforward application of the backward analysis method. We however prefered
+the Karger's original version, because generating  a~random subset of a~given size
+cannot be generally performed in bounded worst-case time.
+
 %--------------------------------------------------------------------------------
 
 \section{Special cases and related problems}
diff --git a/biblio.bib b/biblio.bib
index 10335d4..0e406ad 100644
--- a/biblio.bib
+++ b/biblio.bib
@@ -178,13 +178,33 @@
 
 @article { karger:randomized,
    author = "D. R. Karger and P. N. Klein and R. E. Tarjan",
-   title = "{Linear expected-time algorithms for connectivity problems}",
+   title = "{A Randomized Linear-Time Algorithm to Find Minimum Spanning Trees}",
    journal = jacm,
    volume = "42",
+   number = "2",
    pages = "321--328",
    year = "1995"
 }
 
+@inproceedings { karger:sampling,
+   title={Random sampling in matroids, with applications to graph connectivity and minimum spanning trees},
+   author={Karger, D.R.},
+   booktitle={Proceedings of the 34th Annual Symposium on Foundations of Computer Science},
+   year={1993},
+   pages={84--93}
+}
+
+@article{ chan:backward,
+  title={{Backwards analysis of the Karger-Klein-Tarjan algorithm for minimum spanning trees}},
+  author={Chan, T.M.},
+  journal={Information Processing Letters},
+  volume={67},
+  number={6},
+  pages={303--304},
+  year={1998},
+  publisher={Elsevier}
+}
+
 @article { frederickson:online,
    author = "Greg N. Frederickson",
    title = "{Data structures for on-line updating of minimum spanning trees}",
diff --git a/mst.tex b/mst.tex
index 50e7bd2..b79d0c8 100644
--- a/mst.tex
+++ b/mst.tex
@@ -255,7 +255,7 @@ in~$T_{min}$ joining these vertices must cross~$C$ at least once). Exchanging
 $e$ for $e'$ in $T_{min}$ yields an even lighter spanning tree since
 $w(e)<w(e')$. \qed
 
-\lemman{Red lemma}%
+\lemman{Red lemma}\id{redlemma}%
 When an edge is colored red in any step of the procedure, it is not contained in the minimum spanning tree.
 
 \proof
@@ -670,7 +670,7 @@ to finish on the remaining complete graph. Each iteration runs on a graph with $
 edges as every $H_{a,k}$ contains a complete graph on~$a$ vertices.
 \qed
 
-\remn{Disconnected graphs}
+\remn{Disconnected graphs}\id{disconn}%
 The basic properties of minimum spanning trees and the algorithms presented in
 this chapter apply to minimum spanning forests of disconnected graphs, too.
 The proofs of our theorems and the steps of our algorithms are based on adjacency
-- 
2.39.5