From 136af12f0e36799f53107c35311faf704c232f9a Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Fri, 1 Feb 2008 15:27:14 +0100
Subject: [PATCH] More RAM bits.

---
 biblio.bib | 19 ++++++++++++++++++-
 ram.tex    | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/biblio.bib b/biblio.bib
index 0ea8521..5f7c413 100644
--- a/biblio.bib
+++ b/biblio.bib
@@ -507,13 +507,22 @@ inproceedings{ pettie:minirand,
 
 @book{ knuth:fundalg,
  author = {Donald E. Knuth},
- title = {The art of computer programming, volume 1 (3rd ed.): fundamental algorithms},
+ title = {The Art of Computer Programming, Volume 1 (3rd ed.): Fundamental Algorithms},
  year = {1997},
  isbn = {0-201-89683-4},
  publisher = {Addison Wesley Longman Publishing Co., Inc.},
  address = {Redwood City, CA, USA},
 }
 
+@book{ knuth:seminalg,
+ author = {Donald E. Knuth},
+ title = {The Art of Computer Programming, Volume 2 (3rd ed.): Seminumerical algorithms},
+ year = {1997},
+ isbn = {978-0-201-89684-8},
+ publisher = {Addison Wesley Longman Publishing Co., Inc.},
+ address = {Redwood City, CA, USA},
+}
+
 @inproceedings{ hagerup:wordram,
  author = {Torben Hagerup},
  title = {{Sorting and Searching on the Word RAM}},
@@ -608,3 +617,11 @@ inproceedings{ pettie:minirand,
   year={2002},
   publisher={IEEE Computer Society Washington, DC, USA}
 }
+
+@article{ ito:newrap,
+  title={{Efficient Initial Approximation and Fast Converging Methods for Division and Square Root}},
+  author={Ito, M. and Takagi, N. and Yajima, S.},
+  journal={Proc. 12th IEEE Symp. Computer Arithmetic},
+  pages={2--9},
+  year={1995}
+}
diff --git a/ram.tex b/ram.tex
index e62452e..fb3bd54 100644
--- a/ram.tex
+++ b/ram.tex
@@ -88,6 +88,7 @@ avoid this behavior:
   On the other hand, we are interested in polynomial-time algorithms only, so $\Theta(\log N)$-bit
   numbers should be sufficient. In practice, we pick~$w$ to be the larger of
   $\Theta(\log N)$ and the size of integers used in the algorithm's input and output.
+  We will call an integer stored in a~single memory cell a~\df{machine word.}
 \endlist
 
 Both restrictions easily avoid the problems of unbounded parallelism. The first
@@ -301,7 +302,7 @@ scanning all~$n$ buckets takes $\O(n+m)$ time.
 \section{Data structures on the RAM}
 
 There is a~lot of data structures designed specifically for the RAM, taking
-advantage of both indexing and arithmetic. In many cases, they surpass the known
+advantage of both indexing and arithmetics. In many cases, they surpass the known
 lower bounds for the same problem on the~PM and often achieve constant time
 per operation, at least when either the magnitude of the values or the size of
 the data structure are suitably bounded.
@@ -337,4 +338,47 @@ and Willard and it will also form a~basis for the rest of this chapter.
 
 \section{Bits and vectors}
 
+In this rather technical section, we will show how RAM can be used as a~vector
+computer to operate in parallel on multiple elements, as long as these elements
+fit in a~single machine word. On the first sight this might seem useless, as we
+cannot require large word sizes, but surprisingly often the elements are small
+enough relative to the size of the algorithm's input and thus also relative to
+the minimum possible word size. Also, as the following lemma shows, we can
+easily emulate constant-times longer words:
+
+\lemman{Multiple-precision calculations}
+Given a~RAM with $W$-bit words, we can emulate all calculation and control
+instructions of a~RAM with word size $kW$ in time depending only on the~$k$.
+(This is usually called \df{multiple-precision arithmetics.})
+
+\proof
+We split each word of the ``big'' machine to $W'=W/2$-bit blocks and store these
+blocks in $2k$ consecutive memory cells. Addition, subtraction, comparison,
+bitwise logical operations can be performed block-by-block. Shifts by a~multiple
+of~$W'$ are trivial, otherwise we can combine each block of the result from
+shifted versions of two original blocks.
+To multiply two numbers, we can use the elementary school algorithm using the $W'$-bit
+blocks as digits in base $2^{W'}$ --- the product of any two blocks fits
+in a~single word.
+
+Division is harder, but Newton-Raphson iteration (see~\cite{ito:newrap})
+converges to the quotient in a~constant number of iterations, each of them
+involving $\O(1)$ multiple-precision additions and multiplications. A~good
+starting approximation can be obtained by dividing the two most-significant
+(non-zero) blocks of both numbers.
+
+Another approach to division is using the improved elementary school algorithm as described
+by Knuth in~\cite{knuth:seminalg}. It uses $\O(k^2)$ steps, but the steps involve
+calculation of the most significant bit set in a~word. We will show below that it
+can be done in constant time without using division.
+\qed
+
+\nota
+We will use boldface letters for vectors. The components of a~vector~${\bf x}$
+will be denoted as $x_1,\ldots,x_n$.
+
+\defn
+The \df{canonical representation} of a~vector $(x_1,\ldots,x_n)$ of~$b$-bit numbers
+is an~integer XXXXXXXXXXXXXXX
+
 \endpart
-- 
2.39.5