From 27749502e55ddcb33010db5294e13f0ff2728668 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Sat, 2 Feb 2008 17:48:24 +0100 Subject: [PATCH] Very raw version of the bit operations. --- biblio.bib | 7 ++ macros.tex | 10 ++- mst.tex | 2 +- notation.tex | 3 +- ram.tex | 210 +++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 223 insertions(+), 9 deletions(-) diff --git a/biblio.bib b/biblio.bib index 2e95fb0..4bcfdef 100644 --- a/biblio.bib +++ b/biblio.bib @@ -615,3 +615,10 @@ inproceedings{ pettie:minirand, pages={2--9}, year={1995} } + +@article{ brodnik:lsb, + title={{Computation of the least significant set bit}}, + author={Brodnik, A.}, + journal={Proceedings of the 2nd Electrotechnical and Computer Science Conference, Portoroz, Slovenia}, + year={1993} +} diff --git a/macros.tex b/macros.tex index adef15e..1738889 100644 --- a/macros.tex +++ b/macros.tex @@ -35,7 +35,7 @@ \def\qeditem{{\parfillskip=0pt\hfill\rlap{\hskip\rightskip\llap{$\spadesuit$}}\par}} \def\FIXME#1{\>{\bo TODO:} #1} \def\symdiff{\mathbin{\Delta}} -\def\hphantas#1#2{\setbox0=\hbox{#2}\hbox to \wd0{#1\hss}} +\def\rack#1#2{\setbox0=\hbox{#1}\hbox to \wd0{#2}} \def\o#1{\accent23 #1} \def\mst{\mathop{\rm mst}} \def\deg{\mathop{\rm deg}} @@ -48,6 +48,14 @@ \def\1{{\bf 1}} \def\(#1){\left<#1\right>} +\def\shl{\mathbin{<\!<}} +\def\shr{\mathbin{>\!>}} +\def\bop#1{\mathbin{\hbox{\sc #1}}} +\def\band{\bop{and}} +\def\bor{\bop{or}} +\def\bxor{\bop{xor}} +\def\bnot{\bop{not}} + % A reversed version of \ddots with extra space at the top to get good alignment of exponents. \def\rddots{\mathinner{\mkern1mu\raise\p@\vbox{\kern7\p@\hbox{.}}\mkern2mu \raise4\p@\hbox{.}\mkern2mu\raise7\p@\hbox{.}\raise11\p@\hbox{}\mkern1mu}} diff --git a/mst.tex b/mst.tex index 7e5be90..2a562db 100644 --- a/mst.tex +++ b/mst.tex @@ -228,7 +228,7 @@ Most MST algorithms can be described as special cases of the following procedure \:In the beginning, all edges are colored black. \:Apply rules as long as possible: \::Either pick a cut~$C$ such that its lightest edge is not blue \hfil\break and color this edge blue, \cmt{Blue rule} -\::Or pick a cycle~$C$ such that its heaviest edge is not red \hfil\break and color this edge \hphantas{red.}{blue.} \cmt{Red rule} +\::Or pick a cycle~$C$ such that its heaviest edge is not red \hfil\break and color this edge \rack{blue.}{red.} \cmt{Red rule} \algout Minimum spanning tree of~$G$ consisting of edges colored blue. \endalgo diff --git a/notation.tex b/notation.tex index d89ece3..076d1c1 100644 --- a/notation.tex +++ b/notation.tex @@ -42,9 +42,10 @@ \n{$\log^* n$}{the iterated logarithm: $\log^*n := \min\{i: \log^{(i)}n < 1\}$; the inverse of~$2\tower n$} \n{$\beta(m,n)$}{$\beta(m,n) := \min\{i: \log^{(i)}n < m/n \}$ \[itjarthm]} \n{$W$}{word size of the RAM \[wordsize]} -\n{$\(x)$}{number~$x\in{\bb N}$ written in binary \[bintota]} +\n{$\(x)$}{number~$x\in{\bb N}$ written in binary \[bitnota]} \n{$\(x)_b$}{$\(x)$ zero-padded to exactly $b$ bits \[bitnota]} \n{$\sigma^k$}{the string~$\sigma$ repeated $k$~times \[bitnota]} +\n{$\equiv$}{congruence modulo a~given number} } %-------------------------------------------------------------------------------- diff --git a/ram.tex b/ram.tex index 14a4676..97a6020 100644 --- a/ram.tex +++ b/ram.tex @@ -106,8 +106,8 @@ As for the choice of RAM operations, the following three instruction sets are of \itemize\ibull \:\df{Word-RAM} --- allows the ``C-language operators'', i.e., addition, - subtraction, multiplication, division, remainder, bitwise {\sc and,} {\sc or,} exclusive - {\sc or} ({\sc xor}), negation ({\sc not}) and bitwise shifts ($\ll$ and~$\gg$). + subtraction, multiplication, division, remainder, bitwise $\band$, $\bor$, exclusive + $\bor$ ($\bxor$) and negation ($\bnot$), and bitwise shifts ($\shl$ and~$\shr$). \:\df{${\rm AC}^0$-RAM} --- allows all operations from the class ${\rm AC}^0$, i.e., those computable by constant-depth polynomial-size boolean circuits with unlimited fan-in and fan-out. This includes all operations of the Word-RAM except for multiplication, @@ -205,7 +205,7 @@ to by the nodes of the tree. Both direct and indirect accesses to the memory can therefore be done in~$\O(W)$ time. Use standard algorithms for arithmetic on big numbers: $\O(W)$ per operation except for multiplication, division and remainders which take $\O(W^2)$.\foot{We could use more efficient arithmetic -algorithms, but the quadratic bound it good enough for our purposes.} +algorithms, but the quadratic bound is good enough for our purposes.} \qed \FIXME{Add references.} @@ -388,12 +388,210 @@ we will use $x$ and $\(x)$ interchangeably to avoid outbreak of symbols. \defn The \df{bitwise encoding} of a~vector ${\bf x}=(x_0,\ldots,x_{d-1})$ of~$b$-bit numbers -is an~integer $\0\(x_{d-1})_b\0\(x_{d-2})_b\0\ldots\0\(x_0)_b$. +is an~integer $\0\(x_{d-1})_b\0\(x_{d-2})_b\0\ldots\0\(x_0)_b = \sum_i 2^{(b+1)i}\cdot x_i$. \para If we wish for the whole vector to fit in a~single word, we need $(b+1)d\le W$. By using multiple-precision arithmetics, we can encode all vectors satisfying $bd=\O(W)$. -We will now describe how to translate simple vector manipulations to RAM operations on their -codes. +We will now describe how to translate simple vector manipulations to $\O(1)$ RAM operations +on their codes. + +\FIXME{Mention multi-prec. intermediates} +\FIXME{Mention isolation of fields?} +\FIXME{Fix notation and add it to the table} +\FIXME{Masking} +\FIXME{Optimality} + +\newdimen\slotwd +\def\setslot#1{\setbox0=#1\slotwd=\wd0} +\def\slot#1{\hbox to \slotwd{\hfil #1\hfil}} +\def\[#1]{\slot{$#1$}} +\def\9{\rack{\0}{\hss$\cdot$\hss}} + +\def\alik#1{% +\medskip +\halign{\hskip 0.2\hsize\hfil $ ##$&\hbox to 0.6\hsize{${}##$ \hss}\cr +#1} +\medskip +} + +\algn{Operations on vectors with $d$~elements of $b$~bits each} + +\>$\(\alpha)$ --- creates a~vector $(\alpha,\ldots,\alpha)$: + +\alik{\alpha*(\0^b\1)^d \cr} + +\>$\(x)$ --- calculates the sum of the elements of~${\bf x}$, assuming that +it fits in~$b$ bits: + +\alik{x \bmod \1^{b+1} \cr} + +This works because when taken modulo~$\1^{b+1}$, the number $2^{b+1}=\1\0^{b+1}$ +is congruent to~1 and thus $x = \sum_i 2^{(b+1)i}\cdot x_i \equiv \sum_i 1^i\cdot x_i \equiv \sum_i x_i$. +As the result should fit in $b$~bits, the modulo cannot change it. + +If we want to avoid division, we can use double-precision multiplication instead: + +\setslot{\hbox{~$\0x_{d-1}$}} +\def\.{\[]} +\def\z{\[\0^b\1]} +\def\dd{\slot{$\cdots$}} +\def\vd{\slot{$\vdots$}} +\def\rule{\noalign{\medskip\nointerlineskip}$\hrulefill$\cr\noalign{\nointerlineskip\medskip}} + +\alik{ +\[\0x_{d-1}] \dd \[\0x_2] \[\0x_1] \[\0x_0] \cr +*~~ \z \dd \z\z\z \cr +\rule +\[x_{d-1}] \dd \[x_2] \[x_1] \[x_0] \cr +\[x_{d-1}] \[x_{d-2}] \dd \[x_1] \[x_0] \. \cr +\[x_{d-1}] \[x_{d-2}] \[x_{d-3}] \dd \[x_0] \. \. \cr +\vd\vd\vd\vd\.\.\.\cr +\[x_{d-1}] \dd \[x_2]\[x_1]\[x_0] \. \. \. \. \cr +\rule +\[r_{d-1}] \dd \[r_2] \[r_1] \[s_d] \dd \[s_3] \[s_2] \[s_1] \cr +} + +This way, we even get the vector of all partial sums: +$s_k=\sum_{i=0}^{k-1}x_i$, $r_k=\sum_{i=k}^{d-1}x_i$. + +\>$\(x,y)$ --- element-wise comparison of~vectors ${\bf x}$ and~${\bf y}$, +i.e., a~vector ${\bf z}$ such that $z_i=1$ iff $x_i$\(x,\alpha)$ --- return the number of elements of~${\bf x}$ which are less than~$\alpha$, +assuming that the result fits in~$b$ bits: + +\alik{ +\(\(x,\(\alpha))) \cr +} + +\>$\(x,\alpha)$ --- insert~$\alpha$ to a~sorted vector $\bf x$: + +Calculate $k = \(x,\alpha)$ first, then insert~$\alpha$ as the $k$-th +field of~$\bf x$ using masking operations. + +\>$\(\alpha)$ --- create a~vector whose components are the bits of~$\(\alpha)_b$. +In other words, it inserts $\0^b$ between every two bits of~$\alpha$. + +\algo +\:$x\=\(\alpha)$. +\:$y\=(2^{b-1},2^{b-2},\ldots,2^0)$. \cmt{bitwise encoding of this vector} +\:$z\=x \band y$. +\:Return $\(z,y)$. +\endalgo + +Let us observe that $z_i$ is either zero or equal to~$y_i$ depending on the value +of the $i$-th bit of the number~$\alpha$. Comparing it with~$y_i$ normalizes it +to either zero or one. + +\>$\_\varphi(\alpha)$ --- like \, but changes the order of the +bits according to a~fixed permutation~$\varphi$: The $i$-th element of the +resulting vector is equal to the $\pi(i)$-th bit of~$\alpha$. + +Implemented as above, but with mask~$y=(2^{\pi(b-1)},\ldots,2^{\pi(0)})$. + +\>$\(x)$ --- the inverse of \: given a~vector of zeroes and ones, +it produces a~number whose bits are the elements of the vector (in other words, +it crosses out the $\0^b$ blocks). + +We interpret the~$x$ as an~encoding of a~vector with elements one bit shorter +and sum these elements. For example, when $n=4$ and~$b=4$: + +\setslot{\hbox{$x_3$}} +\def\z{\[\0]} +\def\|{\hskip1pt\vrule height 10pt depth 4pt\hskip1pt} +\def\.{\hphantom{\|}} + +\alik{ +\|\z\.\z\.\z\.\z\.\[x_3]\|\z\.\z\.\z\.\z\.\[x_2]\|\z\.\z\.\z\.\z\[x_1]\|\z\.\z\.\z\.\z\.\[x_0]\|\cr +\|\z\.\z\.\z\.\z\|\[x_3]\.\z\.\z\.\z\|\z\.\[x_2]\.\z\.\z\|\z\.\z\[x_1]\.\z\|\z\.\z\.\z\.\[x_0]\|\cr +} + +However, this ``reformatting'' does not produce a~correct encoding of a~vector, +because the separator zeroes are missing. For this reason, the implementation +of~\ by modulo does not work correctly (it produces $\1^b$ instead of $\0^b$). +We use the technique based on multiplication instead, which does not need +the separators. (Alternatively, we can observe that $\1^b$ is the only case +affected, so we can handle it separately.) + +\FIXME{Note constants} + +\para +We can use the above tricks to perform interesting operations on individual +numbers in constant time, too. Let us assume for a~while that we are +operating on $b$-bit numbers and the word size is at least~$b^2$. +This enables us to make use of intermediate vectors with $b$~elements +of $b$~bits each. + +\algn{Integer operations in quadratic workspace} + +\>$\(\alpha)$ --- compute the Hamming weight of~$\alpha$, i.e., the number of ones in~$\(\alpha)$. + +Perform \ and then \. + +\>$\_\pi(\alpha)$ --- shuffle the bits of~$\alpha$ according +to a~fixed permutation~$\pi$. + +Perform $\_\pi$ and \ back. + +\>$\(\alpha)$ --- find the least significant bit of~$\alpha$, +i.e., the minimum~$i$ such that the $i$-th bit of~$\alpha$ is one. + +By a~combination of subtraction with $\bxor$, we create a~number +which contain ones exactly at positions below $\(\alpha)$: + +\alik{ +\alpha&= \9\9\9\9\9\1\0\0\0\0\cr +\alpha-1&= \9\9\9\9\9\0\1\1\1\1\cr +\alpha\bxor(\alpha-1)&= \0\9\9\9\0\1\1\1\1\1\cr +} + +Then calculate the \ of the result. + +\>$\(\alpha)$ --- find the most significant bit (the position +of the highest bit set in~$\alpha$). + +Reverse the bits of the number first by~\, then apply \ +and subtract the result from~$b-1$. + +\rem +As noted by Brodnik~\cite{brodnik:lsb} and others, the space requirements of +the \ operation can be reduced to linear. We split the input to roughly +$\sqrt{b}$ blocks of roughly $\sqrt{b}$ bits each. Then we find which blocks are +non-zero and identify the lowest non-zero block (this is \ of a~number +whose bits correspond to the blocks). Finally we calculate the \ of this +block. The same trick of course works for the \ as well. +The following algorithm shows the details. + +\algn{LSB in linear workspace} + +\algo +\algin A~$w$-bit number~$\alpha$. +\:$b\=\lceil\sqrt{w}\rceil$. \cmt{size of a~block} +\:$x\=(\alpha \band (\0\1^b)^b) \bor (\alpha \band (\1\0^b)^b)$. +\hfill\break +\cmt{$x_i\ne 0$ iff the $i$-th block is non-zero}% +\foot{Why is this so complicated? It is tempting to take $\alpha$ itself as a~code of this vector, +but we unfortunately need the separator bits between elements, so we create them and +relocate the bits we have overwritten.} +\:$y\=\(0,x)$. \cmt{$y_i=1$ if the $i$-th block is non-zero, otherwise $y_0=0$} +\:$\beta\=\(y)$. \cmt{each block compressed to a~single bit} +\:$p\=\(\beta)$. \cmt{the index of the lowest non-zero block} +\:$\gamma\=(\alpha \shr bp) \band \1^b$. \cmt{the contents of that block} +\:$q\=\(\gamma)$. \cmt{the lowest bit set there} +\algout $\(\alpha) = bp+q$. +\endalgo \endpart -- 2.39.5