% -*- mode: Noweb; noweb-code-mode: icon-mode -*-
\title{htmltoc -- A Table of Contents Generater for HTML}
\author{stephen parker\thanks{Revisions by Norman Ramsey}\\
	 \texttt{stephen@stephen.uk.eu.org}}

\tableofcontents

\section{Introduction}
\texttt{htmltoc} reads an HTML document on standard input and writes the
same document, but optionally including a table of contents on output.
The motivation for this program was Norman Ramsey's \texttt{htmltoc}
program included with the \texttt{noweb} distribution which this program
has no advantages over other than not being written in perl.

% noweave mangles the references in rawhtml blocks if you're not
% careful.  don't ask me why.
\begin{rawhtml}
This html document is created from a noweb
<a href="http://lila.york.ac.uk/htmltoc/htmltoc.nw">source</a>
document which also produces the notangled
<a href="http://lila.york.ac.uk/htmltoc/htmltoc.icn">icon source</a>.
The <a href="http://lila.york.ac.uk/htmltoc/mkfile">mkfile</a> to control
this is also available; if you haven't got mk then it shouldn't be too
hard to turn it into a Makefile.
\end{rawhtml}

\section{Implementation}
The Icon source follows the usual pattern.
<<*>>=
<<Preprocessor constants>>
<<Data structures>>
<<Global Variables>>
<<Procedures>>
@ Any text between the marks [["<tableofcontents>"]] and
[["</tableofcontents>"]] is replaced on output with a table
of contents for the document.
<<Preprocessor constants>>=
$define	sentinel "<tableofcontents>"
$define	sentinel_end "</tableofcontents>"
@ The TOC is created from the header items in the text.  These
are stored in an [[ioc]] record.  The headers are sequentially
numbered and the level of the heading and the heading text are
stored in a list of [[ioc]]s named [[contents]].
<<Data structures>>=
record	ioc(number, level, text)
<<Global Variables>>=
global	contents
@ HTMLtoc must make two passes through the file in order to
generate a TOC before the actual material.  Since we'd like
HTMLtoc to run in a pipeline we actually store the entire
input in a list named [[file]].
<<Procedures>>=
procedure	readfile(f)
	local old_contents
	intoc := &null
	contents := []
        old_contents := []
	no := 0

	file := list()
	while l := read(f) do l ? { <<handle line>> }
        if \intoc then {
           push(old_contents, sentinel_end)
           every l := !copy(old_contents) do l ? { <<handle line>> }
	}

	return file
end
@ The variable [[intoc]] is [[1]] if we are between the TOC markers
on input.  [[no]] is incremented for each header read.
<<handle line>>=
if =sentinel then {
	intoc := "in table of contents"
	put(file, l)
} else if =sentinel_end then {
	intoc := &null
	put(file, l)
} else if \intoc then {
        put(old_contents, l)
} else if /intoc then {
	if <<[[l]] includes a header>> then {
		<<add header line to [[contents]] and [[file]]>>
	} else
		put(file, l)
}
@ We look for a string of the form \verb|<[Hh][0-9]>| which
signifies the beginning of a header.  This isn't nice Icon.
<<[[l]] includes a header>>=
(pre := tab(upto('<')), ="<", skip(), tab(any('hH')), n := tab(many(&digits)), 
 skip(), =">")
@  [[skip]] skips white space, if present.
<<Procedures>>=
procedure skip()
  suspend tab(many(' \t')) | ""
end
@ The scanning environment contains a header, the 
\verb|<[Hh][0-9]>| has already been consumed, so
the text of the header can by found by scanning for the close of header.
The header number,
[[no]] is incremented and then [[addcontents()]] is
called to add the item to the [[contents]] list.
<<add header line to [[contents]] and [[file]]>>=
if (c := tab(upto('<')), ="</", skip(), tab(any('hH')), =n, skip(), =">") then {
  no +:= 1
  c ? c := strip_toc_anchors() || tab(0)
  addcontents(no, n, c)
  <<append modified line to [[file]]>>
} else
  write(&errout, "Unterminated <h", n, ">", tab(0), "...")
@ We also need to label the heading in the text, so
we can reference it from the TOC.  
<<append modified line to [[file]]>>=
put(file, pre || "<h" || n || "><a name=toc" || no || ">" || c ||
          "</a></h" || n || ">" || tab(0))
@ When stripping these, we rely on their having been put there by this
program.
<<Procedures>>=
procedure strip_toc_anchors()
  local s, p
  if (="<a name=toc", tab(many(&digits)), =">", s := strip_toc_anchors(), ="</a>") then
    return s
  else {
    s := to_next_anchor_or_end_anchor() | runerr(1, "never found an anchor")
    if pos(0) | at_end_anchor() then
      return s
    else
      return s || tab_past_anchor_start() || strip_toc_anchors() ||
             tab_past_anchor_end()
  }      
end
@
<<Procedures>>=
procedure at_end_anchor() 
  &subject[&pos:0] ?
    return tab_past_anchor_end()
end

procedure at_start_anchor() 
  &subject[&pos:0] ?
    return tab_past_anchor_start()
end

procedure tab_past_anchor_end()
  suspend ="<" || optwhite() || ="/" || optwhite() || =("a"|"A") || optwhite() || =">"
end

procedure tab_past_anchor_start()
  suspend ="<" || optwhite() || =("a"|"A") || white() || tab(upto('>')) || =">"
end

procedure to_next_anchor_or_end_anchor()
  return (1(tab(upto('<')), at_start_anchor() | at_end_anchor()) | tab(0)) \ 1
end

procedure white()
  suspend tab(many(' \t'))
end
procedure optwhite()
  suspend white() | ""
end
@ [[addcontents()]] should simply append the [[ioc]]
record for the header to the [[contents]] list.  But
one problem is that the text in the header may not
be simple -- it might contain markup.  One case where
we don't want to copy this markup is if it is
a \verb|<a name="|... token, since then there would be
multiple targets for this name.  Any markup of the
form \verb|<| ... \verb|>| is stripped from the header --
this harsh rule may be relaxed in future.
We also avoid adding the line ``Contents'' to the contents.
<<Procedures>>=
procedure addcontents(no, n, s)
	if map(s) ~== "contents" & find(n, toclevels) then
		put(contents, ioc(no, n, strip_tags(s)))
	return
end

procedure strip_tags(s)
  s ? {
    r := ""
    while r ||:= tab(upto('<')) do (tab(upto('>')), =">")
    return r || tab(0)
  }
end
@ 
The procedure [[writetoc()]] writes the TOC on
output using the [[contents]] list.  We keep a current
header level in the integer variable [[level]].  As
we iterate through the [[contents]] list we insert \verb|<ul compact>|s and
\verb|</ul>|s as required for formatting.
<<Procedures>>=
global minlevel

procedure tocindent(level)
  return repl("  ", level-minlevel)
end

procedure	writetoc()
	local level
	minlevel := 99
        every minlevel >:= (!contents).level 
	level := minlevel - 1 # triggers opening list

	every l := !contents do {
		<<adjust [[level]] to [[l.level]]>>
		write(tocindent(level),
		      "<li><a href=\"#toc", l.number, "\">", l.text, "</a></li>")
	}
	<<close any lists>>
end
@ We write out a \verb|</ul>| for each level we are beyond the
current item, and a \verb|<ul compact>| for each level we are below
the current.  The call to [[repl()]] merely writes the correct
number of tabs so as to make the contents look ok.
<<adjust [[level]] to [[l.level]]>>=
while level > l.level do {
	write(tocindent(level), "</ul>")
	level -:= 1
}
while level < l.level do {
	level +:= 1
	write(tocindent(level), "<ul compact>")
}
@ At the end of the contents we need to write as many
\verb|</ul>|s as are required.
<<close any lists>>=
while level >= minlevel do {
	write(tocindent(level), "</ul>")
	level -:= 1
}
@ The [[add_contents()]] procedure simply reads the input using
[[readfile()]] then writes it out again.  If the
sentinel line is found then the TOC is written at
that point.
<<Procedures>>=
procedure add_contents(f)
	file := readfile(f)

	every l := !file do {
	        write(l)
		if match(sentinel, l) then writetoc()
	}
end
@ 
[[main]] makes the program read its arguments or act as a filter.
<<Procedures>>=
global toclevels
procedure main(args)
  if args[1] ? (="-", toclevels := tab(many(&digits)), pos(0)) then get(args)
  else toclevels := "2345"
  if *args = 0 then add_contents(&input)
  else every add_contents(openfile(!args))
  return
end

procedure openfile(f)
  return open(f) | stop("Cannot open file ", f)
end
<<*>>=
procedure rcsinfo () 
  return "$Id: htmltoc.nw,v 1.17 2006/06/12 21:03:54 nr Exp nr $" ||
         "$Name: v2_11b $"
end
@