From 27749502e55ddcb33010db5294e13f0ff2728668 Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Sat, 2 Feb 2008 17:48:24 +0100
Subject: [PATCH] Very raw version of the bit operations.

---
 biblio.bib   |   7 ++
 macros.tex   |  10 ++-
 mst.tex      |   2 +-
 notation.tex |   3 +-
 ram.tex      | 210 +++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 223 insertions(+), 9 deletions(-)

diff --git a/biblio.bib b/biblio.bib
index 2e95fb0..4bcfdef 100644
--- a/biblio.bib
+++ b/biblio.bib
@@ -615,3 +615,10 @@ inproceedings{ pettie:minirand,
   pages={2--9},
   year={1995}
 }
+
+@article{ brodnik:lsb,
+  title={{Computation of the least significant set bit}},
+  author={Brodnik, A.},
+  journal={Proceedings of the 2nd Electrotechnical and Computer Science Conference, Portoroz, Slovenia},
+  year={1993}
+}
diff --git a/macros.tex b/macros.tex
index adef15e..1738889 100644
--- a/macros.tex
+++ b/macros.tex
@@ -35,7 +35,7 @@
 \def\qeditem{{\parfillskip=0pt\hfill\rlap{\hskip\rightskip\llap{$\spadesuit$}}\par}}
 \def\FIXME#1{\>{\bo TODO:} #1}
 \def\symdiff{\mathbin{\Delta}}
-\def\hphantas#1#2{\setbox0=\hbox{#2}\hbox to \wd0{#1\hss}}
+\def\rack#1#2{\setbox0=\hbox{#1}\hbox to \wd0{#2}}
 \def\o#1{\accent23 #1}
 \def\mst{\mathop{\rm mst}}
 \def\deg{\mathop{\rm deg}}
@@ -48,6 +48,14 @@
 \def\1{{\bf 1}}
 \def\(#1){\left<#1\right>}
 
+\def\shl{\mathbin{<\!<}}
+\def\shr{\mathbin{>\!>}}
+\def\bop#1{\mathbin{\hbox{\sc #1}}}
+\def\band{\bop{and}}
+\def\bor{\bop{or}}
+\def\bxor{\bop{xor}}
+\def\bnot{\bop{not}}
+
 % A reversed version of \ddots with extra space at the top to get good alignment of exponents.
 \def\rddots{\mathinner{\mkern1mu\raise\p@\vbox{\kern7\p@\hbox{.}}\mkern2mu
     \raise4\p@\hbox{.}\mkern2mu\raise7\p@\hbox{.}\raise11\p@\hbox{}\mkern1mu}}
diff --git a/mst.tex b/mst.tex
index 7e5be90..2a562db 100644
--- a/mst.tex
+++ b/mst.tex
@@ -228,7 +228,7 @@ Most MST algorithms can be described as special cases of the following procedure
 \:In the beginning, all edges are colored black.
 \:Apply rules as long as possible:
 \::Either pick a cut~$C$ such that its lightest edge is not blue \hfil\break and color this edge blue, \cmt{Blue rule}
-\::Or pick a cycle~$C$ such that its heaviest edge is not red \hfil\break and color this edge \hphantas{red.}{blue.} \cmt{Red rule}
+\::Or pick a cycle~$C$ such that its heaviest edge is not red \hfil\break and color this edge \rack{blue.}{red.} \cmt{Red rule}
 \algout Minimum spanning tree of~$G$ consisting of edges colored blue.
 \endalgo
 
diff --git a/notation.tex b/notation.tex
index d89ece3..076d1c1 100644
--- a/notation.tex
+++ b/notation.tex
@@ -42,9 +42,10 @@
 \n{$\log^* n$}{the iterated logarithm: $\log^*n := \min\{i: \log^{(i)}n < 1\}$; the inverse of~$2\tower n$}
 \n{$\beta(m,n)$}{$\beta(m,n) := \min\{i: \log^{(i)}n < m/n \}$ \[itjarthm]}
 \n{$W$}{word size of the RAM \[wordsize]}
-\n{$\(x)$}{number~$x\in{\bb N}$ written in binary \[bintota]}
+\n{$\(x)$}{number~$x\in{\bb N}$ written in binary \[bitnota]}
 \n{$\(x)_b$}{$\(x)$ zero-padded to exactly $b$ bits \[bitnota]}
 \n{$\sigma^k$}{the string~$\sigma$ repeated $k$~times \[bitnota]}
+\n{$\equiv$}{congruence modulo a~given number}
 }
 
 %--------------------------------------------------------------------------------
diff --git a/ram.tex b/ram.tex
index 14a4676..97a6020 100644
--- a/ram.tex
+++ b/ram.tex
@@ -106,8 +106,8 @@ As for the choice of RAM operations, the following three instruction sets are of
 
 \itemize\ibull
 \:\df{Word-RAM} --- allows the ``C-language operators'', i.e., addition,
-  subtraction, multiplication, division, remainder, bitwise {\sc and,} {\sc or,} exclusive
-  {\sc or} ({\sc xor}), negation ({\sc not}) and bitwise shifts ($\ll$ and~$\gg$).
+  subtraction, multiplication, division, remainder, bitwise $\band$, $\bor$, exclusive
+  $\bor$ ($\bxor$) and negation ($\bnot$), and bitwise shifts ($\shl$ and~$\shr$).
 \:\df{${\rm AC}^0$-RAM} --- allows all operations from the class ${\rm AC}^0$, i.e.,
   those computable by constant-depth polynomial-size boolean circuits with unlimited
   fan-in and fan-out. This includes all operations of the Word-RAM except for multiplication,
@@ -205,7 +205,7 @@ to by the nodes of the tree. Both direct and indirect accesses to the memory
 can therefore be done in~$\O(W)$ time. Use standard algorithms for arithmetic
 on big numbers: $\O(W)$ per operation except for multiplication, division and
 remainders which take $\O(W^2)$.\foot{We could use more efficient arithmetic
-algorithms, but the quadratic bound it good enough for our purposes.}
+algorithms, but the quadratic bound is good enough for our purposes.}
 \qed
 
 \FIXME{Add references.}
@@ -388,12 +388,210 @@ we will use $x$ and $\(x)$ interchangeably to avoid outbreak of symbols.
 
 \defn
 The \df{bitwise encoding} of a~vector ${\bf x}=(x_0,\ldots,x_{d-1})$ of~$b$-bit numbers
-is an~integer $\0\(x_{d-1})_b\0\(x_{d-2})_b\0\ldots\0\(x_0)_b$.
+is an~integer $\0\(x_{d-1})_b\0\(x_{d-2})_b\0\ldots\0\(x_0)_b = \sum_i 2^{(b+1)i}\cdot x_i$.
 
 \para
 If we wish for the whole vector to fit in a~single word, we need $(b+1)d\le W$.
 By using multiple-precision arithmetics, we can encode all vectors satisfying $bd=\O(W)$.
-We will now describe how to translate simple vector manipulations to RAM operations on their
-codes.
+We will now describe how to translate simple vector manipulations to $\O(1)$ RAM operations
+on their codes.
+
+\FIXME{Mention multi-prec. intermediates}
+\FIXME{Mention isolation of fields?}
+\FIXME{Fix notation and add it to the table}
+\FIXME{Masking}
+\FIXME{Optimality}
+
+\newdimen\slotwd
+\def\setslot#1{\setbox0=#1\slotwd=\wd0}
+\def\slot#1{\hbox to \slotwd{\hfil #1\hfil}}
+\def\[#1]{\slot{$#1$}}
+\def\9{\rack{\0}{\hss$\cdot$\hss}}
+
+\def\alik#1{%
+\medskip
+\halign{\hskip 0.2\hsize\hfil $ ##$&\hbox to 0.6\hsize{${}##$ \hss}\cr
+#1}
+\medskip
+}
+
+\algn{Operations on vectors with $d$~elements of $b$~bits each}
+
+\>$\<Replicate>(\alpha)$ --- creates a~vector $(\alpha,\ldots,\alpha)$:
+
+\alik{\alpha*(\0^b\1)^d \cr}
+
+\>$\<Sum>(x)$ --- calculates the sum of the elements of~${\bf x}$, assuming that
+it fits in~$b$ bits:
+
+\alik{x \bmod \1^{b+1} \cr}
+
+This works because when taken modulo~$\1^{b+1}$, the number $2^{b+1}=\1\0^{b+1}$
+is congruent to~1 and thus $x = \sum_i 2^{(b+1)i}\cdot x_i \equiv \sum_i 1^i\cdot x_i \equiv \sum_i x_i$.
+As the result should fit in $b$~bits, the modulo cannot change it.
+
+If we want to avoid division, we can use double-precision multiplication instead:
+
+\setslot{\hbox{~$\0x_{d-1}$}}
+\def\.{\[]}
+\def\z{\[\0^b\1]}
+\def\dd{\slot{$\cdots$}}
+\def\vd{\slot{$\vdots$}}
+\def\rule{\noalign{\medskip\nointerlineskip}$\hrulefill$\cr\noalign{\nointerlineskip\medskip}}
+
+\alik{
+\[\0x_{d-1}] \dd \[\0x_2] \[\0x_1] \[\0x_0] \cr
+*~~ \z \dd \z\z\z \cr
+\rule
+\[x_{d-1}] \dd \[x_2] \[x_1] \[x_0] \cr
+\[x_{d-1}] \[x_{d-2}] \dd \[x_1] \[x_0] \. \cr
+\[x_{d-1}] \[x_{d-2}] \[x_{d-3}] \dd \[x_0] \. \. \cr
+\vd\vd\vd\vd\.\.\.\cr
+\[x_{d-1}] \dd \[x_2]\[x_1]\[x_0] \. \. \. \. \cr
+\rule
+\[r_{d-1}] \dd \[r_2] \[r_1] \[s_d] \dd \[s_3] \[s_2] \[s_1] \cr
+}
+
+This way, we even get the vector of all partial sums:
+$s_k=\sum_{i=0}^{k-1}x_i$, $r_k=\sum_{i=k}^{d-1}x_i$.
+
+\>$\<Cmp>(x,y)$ --- element-wise comparison of~vectors ${\bf x}$ and~${\bf y}$,
+i.e., a~vector ${\bf z}$ such that $z_i=1$ iff $x_i<y_i$:
+
+\setslot{\vbox{\hbox{~$x_{d-1}$}\hbox{~$y_{d-1}$}}}
+\alik{
+   \1 \[x_{d-1}] \1 \[x_{d-2}] \[\cdots] \1 \[x_1] \1 \[x_0]  \cr
+-~ \0 \[y_{d-1}] \0 \[y_{d-2}] \[\cdots] \0 \[y_1] \0 \[y_0]  \cr
+}
+
+We replace the separator zeroes in~$x$ by ones (this is just an~$\bor$ with
+a~suitable constant) and subtract~$y$. These ones change back to zeroes exactly at
+the positions where $x_i<y_i$ and they stop carries from propagating, so the
+fields do not interact with each other. It only remains to shift the separator
+bits to the right positions, negate them and zero out all other bits.
+
+\>$\<Rank>(x,\alpha)$ --- return the number of elements of~${\bf x}$ which are less than~$\alpha$,
+assuming that the result fits in~$b$ bits:
+
+\alik{
+\<Sum>(\<Cmp>(x,\<Replicate>(\alpha))) \cr
+}
+
+\>$\<Insert>(x,\alpha)$ --- insert~$\alpha$ to a~sorted vector $\bf x$:
+
+Calculate $k = \<Rank>(x,\alpha)$ first, then insert~$\alpha$ as the $k$-th
+field of~$\bf x$ using masking operations.
+
+\>$\<Unpack>(\alpha)$ --- create a~vector whose components are the bits of~$\(\alpha)_b$.
+In other words, it inserts $\0^b$ between every two bits of~$\alpha$.
+
+\algo
+\:$x\=\<Replicate>(\alpha)$.
+\:$y\=(2^{b-1},2^{b-2},\ldots,2^0)$. \cmt{bitwise encoding of this vector}
+\:$z\=x \band y$.
+\:Return $\<Cmp>(z,y)$.
+\endalgo
+
+Let us observe that $z_i$ is either zero or equal to~$y_i$ depending on the value
+of the $i$-th bit of the number~$\alpha$. Comparing it with~$y_i$ normalizes it
+to either zero or one.
+
+\>$\<Unpack>_\varphi(\alpha)$ --- like \<Unpack>, but changes the order of the
+bits according to a~fixed permutation~$\varphi$: The $i$-th element of the
+resulting vector is equal to the $\pi(i)$-th bit of~$\alpha$.
+
+Implemented as above, but with mask~$y=(2^{\pi(b-1)},\ldots,2^{\pi(0)})$.
+
+\>$\<Pack>(x)$ --- the inverse of \<Unpack>: given a~vector of zeroes and ones,
+it produces a~number whose bits are the elements of the vector (in other words,
+it crosses out the $\0^b$ blocks).
+
+We interpret the~$x$ as an~encoding of a~vector with elements one bit shorter
+and sum these elements. For example, when $n=4$ and~$b=4$:
+
+\setslot{\hbox{$x_3$}}
+\def\z{\[\0]}
+\def\|{\hskip1pt\vrule height 10pt depth 4pt\hskip1pt}
+\def\.{\hphantom{\|}}
+
+\alik{
+\|\z\.\z\.\z\.\z\.\[x_3]\|\z\.\z\.\z\.\z\.\[x_2]\|\z\.\z\.\z\.\z\[x_1]\|\z\.\z\.\z\.\z\.\[x_0]\|\cr
+\|\z\.\z\.\z\.\z\|\[x_3]\.\z\.\z\.\z\|\z\.\[x_2]\.\z\.\z\|\z\.\z\[x_1]\.\z\|\z\.\z\.\z\.\[x_0]\|\cr
+}
+
+However, this ``reformatting'' does not produce a~correct encoding of a~vector,
+because the separator zeroes are missing. For this reason, the implementation
+of~\<Sum> by modulo does not work correctly (it produces $\1^b$ instead of $\0^b$).
+We use the technique based on multiplication instead, which does not need
+the separators. (Alternatively, we can observe that $\1^b$ is the only case
+affected, so we can handle it separately.)
+
+\FIXME{Note constants}
+
+\para
+We can use the above tricks to perform interesting operations on individual
+numbers in constant time, too. Let us assume for a~while that we are
+operating on $b$-bit numbers and the word size is at least~$b^2$.
+This enables us to make use of intermediate vectors with $b$~elements
+of $b$~bits each.
+
+\algn{Integer operations in quadratic workspace}
+
+\>$\<Weight>(\alpha)$ --- compute the Hamming weight of~$\alpha$, i.e., the number of ones in~$\(\alpha)$.
+
+Perform \<Unpack> and then \<Sum>.
+
+\>$\<Permute>_\pi(\alpha)$ --- shuffle the bits of~$\alpha$ according
+to a~fixed permutation~$\pi$.
+
+Perform $\<Unpack>_\pi$ and \<Pack> back.
+
+\>$\<LSB>(\alpha)$ --- find the least significant bit of~$\alpha$,
+i.e., the minimum~$i$ such that the $i$-th bit of~$\alpha$ is one.
+
+By a~combination of subtraction with $\bxor$, we create a~number
+which contain ones exactly at positions below $\<LSB>(\alpha)$:
+
+\alik{
+\alpha&=			\9\9\9\9\9\1\0\0\0\0\cr
+\alpha-1&=			\9\9\9\9\9\0\1\1\1\1\cr
+\alpha\bxor(\alpha-1)&=		\0\9\9\9\0\1\1\1\1\1\cr
+}
+
+Then calculate the \<Weight> of the result.
+
+\>$\<MSB>(\alpha)$ --- find the most significant bit (the position
+of the highest bit set in~$\alpha$).
+
+Reverse the bits of the number first by~\<Permute>, then apply \<LSB>
+and subtract the result from~$b-1$.
+
+\rem
+As noted by Brodnik~\cite{brodnik:lsb} and others, the space requirements of
+the \<LSB> operation can be reduced to linear. We split the input to roughly
+$\sqrt{b}$ blocks of roughly $\sqrt{b}$ bits each. Then we find which blocks are
+non-zero and identify the lowest non-zero block (this is \<LSB> of a~number
+whose bits correspond to the blocks). Finally we calculate the \<LSB> of this
+block. The same trick of course works for the \<MSB> as well.
+The following algorithm shows the details.
+
+\algn{LSB in linear workspace}
+
+\algo
+\algin A~$w$-bit number~$\alpha$.
+\:$b\=\lceil\sqrt{w}\rceil$. \cmt{size of a~block}
+\:$x\=(\alpha \band (\0\1^b)^b) \bor (\alpha \band (\1\0^b)^b)$.
+\hfill\break
+\cmt{$x_i\ne 0$ iff the $i$-th block is non-zero}%
+\foot{Why is this so complicated? It is tempting to take $\alpha$ itself as a~code of this vector,
+but we unfortunately need the separator bits between elements, so we create them and
+relocate the bits we have overwritten.}
+\:$y\=\<Cmp>(0,x)$. \cmt{$y_i=1$ if the $i$-th block is non-zero, otherwise $y_0=0$}
+\:$\beta\=\<Pack>(y)$. \cmt{each block compressed to a~single bit}
+\:$p\=\<LSB>(\beta)$. \cmt{the index of the lowest non-zero block}
+\:$\gamma\=(\alpha \shr bp) \band \1^b$. \cmt{the contents of that block}
+\:$q\=\<LSB>(\gamma)$. \cmt{the lowest bit set there}
+\algout $\<LSB>(\alpha) = bp+q$.
+\endalgo
 
 \endpart
-- 
2.39.5