<?php

require_once('common.php');

/** @file gs-manual-import.php
 *  This script transforms the single XML manual file required by the rest of
 *  the Greenstone manual generation scripts into the series of dokuwiki pages
 *  that make up a certain manual (as specified by the 'm' argument) in a
 *  certain language ('l').
 */

// Bail out early when the arguments cannot be parsed.
// NOTE(review): parseCLIArguments() and printError() are not defined in this
// part of the file; presumably the former maps command line options onto
// $_REQUEST and the latter terminates the script - confirm in common.php.
if (!parseCLIArguments())
 {
   printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
 }

// 0. Initialization
// - fall back to the English user manual when no language/manual was given
//   (empty() is also true for unset keys, so no separate isset() is needed)
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  // - the other accepted manuals are 'install', 'develop' and 'paper'
  $_REQUEST['m'] = 'user';
}

// - validate arguments before we use them (security)
// - the D (dollar-end-only) modifier is required: without it '$' also matches
//   just before a trailing newline, so "user\n" would pass the whitelist and
//   later end up in file paths and wiki links
if (!preg_match('/^(develop|install|paper|user)$/D',$_REQUEST['m']))
 {
   printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
 }

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D',$_REQUEST['l']))
 {
   printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
 }

// - report what is about to be imported (both arguments have been checked
//   against a whitelist above, so they are safe to echo as-is)
echo '<h2>Importing Greenstone Manual XML</h2>' . "\n";
echo '<p><b>XML Source Path:</b> ' . $xml_source_path . '<br/><b>Manual:</b> ' . $_REQUEST['m'] . '<br/><b>Language:</b> ' . $_REQUEST['l'] . "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";
// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
//   needs to be created last, so we have to store its information
// - all of the variables below are script-level globals; helper functions
//   (e.g. addMetadata()) reach them through 'global'
// - metadata field name => list of values, filled in by addMetadata()
$manual_metadata = array();
// - entity name => numeric character code, collected from <!ENTITY ...> lines
$entity_replacements = array();
// - footnote id => footnote text, collected by the pre-pass below
$footnotes = array();
// - 'page_name|title' entries in the order they should appear in the contents
$page_order = array();
// - number of pages reported at the end of the import; incremented once per
//   chapter and once for the top page
//   NOTE(review): the starting value of 2 is not explained by the visible
//   code - confirm what it is meant to account for
$page_count = 2;
// - name of the cover metadata element currently open ('' when none)
$looking_for_metadata = '';
// - handle of the chapter page currently being written (false when none)
$chapter_txt_out = false;
// - accumulated wiki text for everything that is not inside a chapter
$frontmatter_text = '';
$in_section = false;
// - page name used for the frontmatter page (id of the first loose section)
$sections_page_name = '';
$in_chapter = false;
$chapter_id = '';
// - current nesting depth of bullet/numbered lists (0 = not in a list)
$bullet_depth = 0;
$is_numbered_list = true;
// - only read when reporting an unparsable line; it is never incremented in
//   the visible code - presumably maintained by getLine(), TODO confirm
$line_counter = 0;
// - true while consecutive code lines are being gathered into one <code> block
$in_code = false;
$in_footnotes = false;
// - 0 outside a numbered item, 1 when an item has just been opened, and
//   incremented for every text block emitted inside the item
$in_numbered_item = 0;
$in_bullet_item = false;
// - set when a code block closes inside a numbered list so that the following
//   text is run on without a new bullet
$seen_code_in_item = false;
$in_indent = false;
// - set by a figure carrying 'withLineNumber'; selects '<code 1>' over '<code>'
$is_code_linenumbered = false;
// - construct the path using the information we've been provided as arguments
$xml_file_path = $xml_source_path . '/' . $_REQUEST['l'] . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
 {
   printError('Failed to locate top level page for manual');
 }
// - we also use this opportunity to read in any footnotes as we'll need to
//   move them onto their appropriate page
// - this pre-pass expects each footnote to occupy exactly three lines: the
//   opening <Footnote id="N">, a single-line <Text>, and the closing element
while (($line = fgets($xml_in)) !== false)
 {
   if (preg_match('/<Footnote id="(\d+)">/', $line, $matches))
   {
     $footnote_id = $matches[1];
     $text_line = fgets($xml_in);
     if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
     {
       // - keep the text id as an html comment in front of the translated text
       $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
     }
     // - throw away </Footnote>
     fgets($xml_in);
   }
 }
fclose($xml_in);
// - now reopen to parse it
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
 {
   printError('Failed to locate top level page for manual');
 }
// Main parsing pass: read the manual XML one line at a time and act on the
// first pattern in the long if/elseif chain below that matches the line.
// Output for content inside a <Chapter> is written straight to that chapter's
// page file; everything else is accumulated in $frontmatter_text.
// NOTE(review): getLine(), translateText(), translateTableCodeLine(),
// generateID(), noComments(), alternateComments(), handleImage(), mkAllDir()
// and printError() are defined outside this part of the file.
while (($line = getLine($xml_in)) !== false)
 {
   // - Special Case: lingering code blocks, continue if next line also
   //   contains code, otherwise we need an extra newline
   if ($in_code)
   {
     $code_text = "\n";
     if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
     {
       $code_text .= '</code>';
       // - codeblocks that appear inside numbered lists do not get their own
       //   newlines as that would split the item. Instead newlines will be
       //   added when </NumberedItem> encountered.
       if (!$is_numbered_list)
       {
         $code_text .= "\n\n";
       }
       else
       {
         $seen_code_in_item = true;
       }
       $in_code = false;
     }
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $code_text);
     }
     else
     {
       $frontmatter_text .= $code_text;
     }
   }
   // - some system metadata to watch for
   if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
   {
     $entity = $matches[1];
     addMetadata('ENTITY',$entity);
     // - numeric character entities are also remembered by name
     if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
     {
       $entity_replacements[$matches[1]] = $matches[2];
     }
   }
   // - we have an explicit list of cover metadata to watch for
   elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
   {
     $looking_for_metadata = $matches[1];
   }
   elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
   {
     $looking_for_metadata = '';
   }
   // - found metadata we have!
   elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
   {
     $text_id = $matches[1];
     $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
     addMetadata($looking_for_metadata, $text);
   }
   // - bogus metadata found in French version
   elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
   {
   }
   // - any text we encounter outside of both sections and chapters also
   //   belongs on the cover
   elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
   {
     // (for now I'll assume id's are persistent)
     addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
   }
   // - we will probably encounter the opening section (which is outside of a
   //   chapter) first, so we have a special case for it
   elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
   {
     $section_id = $matches[1];
     // - if this is the first non-chapter section we have encountered then it
     //   gets the honor of having the page---that these sections will
     //   eventually be printed out on---named after it. Typically this should
     //   be "about_this_manual"
     if (empty($frontmatter_text))
     {
       $sections_page_name = $section_id;
     }
     $in_section = true;
     $title = getTitle($xml_in, 'section:' . $section_id);
     if (empty($frontmatter_text))
     {
       // - the frontmatter page always comes first in the contents
       array_unshift($page_order, $sections_page_name . '|' . noComments($title));
     }
     // - if the title, as is, wouldn't autogenerate the appropriate id, then
     //   we have to include the id explicitly (as another html comment block)
     if ($section_id != generateID($title))
     {
       $title = '<!-- sid:' . $section_id . ' -->' . $title;
       // NOTE(review): $seen_ids is not initialised in the visible code -
       // presumably it is shared with generateID(); confirm
       $seen_ids[$section_id] = 1;
     }
     $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
     // - whew. Chapter's going to be just as bad though.
   }
   elseif ($in_section && preg_match('/<\/Section>/', $line))
   {
     $in_section = false;
   }
   elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
   {
     $chapter_id = $matches[1];
     echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
     $chapter_page_name = $chapter_id;
     // - create a new file to store this chapter
     $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
     if (!file_exists($chapter_file_dir))
     {
       mkAllDir($chapter_file_dir, 0755);
     }
     $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
     // - backup existing file
     if (file_exists($chapter_file_path))
     {
       $chapter_backup_file_path = $chapter_file_path . '.bak';
       // - the file is about to be truncated by fopen(..., 'w'), so a failed
       //   backup must not go unnoticed (same handling as for the top page)
       if (!rename($chapter_file_path, $chapter_backup_file_path))
       {
         printError('Failed to rename existing chapter page for backup: ' . $chapter_page_name);
       }
     }
     // - open new file for writing
     $chapter_txt_out = fopen($chapter_file_path, 'w');
     if (!$chapter_txt_out)
     {
       printError('Failed to open page file for writing: ' . $chapter_page_name);
     }
     $in_chapter = true;
     $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
     fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
     array_push($page_order, $chapter_page_name . '|' . noComments($title));
   }
   elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
   {
     fclose($chapter_txt_out);
     $chapter_txt_out = false;
     $in_chapter = false;
     $page_count++;
   }
   // - section, subsection and part titles within chapter
   elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
   {
     $title_type = $matches[1];
     $section_id = $matches[2];
     if (empty($section_id))
     {
       $section_id = generateID(strtolower($title_type));
     }
     echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
     $header_fix = '';
     $title = getTitle($xml_in, 'heading: ' . $title_type);
     // - the heading level decreases from section to subsection to part
     if ($title_type == 'Section')
     {
       $header_fix = '=====';
     }
     if ($title_type == 'Subsection')
     {
       $header_fix = '====';
     }
     if ($title_type == 'Part')
     {
       $header_fix = '===';
       // - remove b's and i's
       $title = preg_replace('/<\/?(B|I)>/i', '', $title);
     }
     // - if the title, as is, wouldn't autogenerate the appropriate id, then
     //   we have to include the id explicitly (as another html comment block)
     if ($section_id != generateID($title))
     {
       $title = '<!-- sid:' . $section_id . ' -->' . $title;
       $seen_ids[$section_id] = 1;
     }
     fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
   }
   elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false  || strpos($line, '</Part') !== false))
   {
     // do nothing for now
   }
   // - figures (and their titles/captions)
   elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
   {
     $figure_id = $matches[1];
     $other_attributes = $matches[2];
     echo '[adding figure: ' . $figure_id . "] \n";
     // We need the title too
     $caption = getTitle($xml_in, 'figure:' . $figure_id);
     $caption = translateText(alternateComments($caption));
     $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
     // - we also check the other attributes to see if the XML has requested
     //   any following codeblock be linenumbered
     if (strpos($other_attributes, 'withLineNumber') !== false)
     {
       $is_code_linenumbered = true;
       $txt .= '%!-- withLineNumber --%';
     }
     $txt .= '>';
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $txt);
     }
     else
     {
       $frontmatter_text .= $txt;
     }
   }
   elseif (strpos($line, '</Figure>') !== false)
   {
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, "</imgcaption>\n\n");
     }
     else
     {
       $frontmatter_text .= "</imgcaption>\n\n";
     }
     // - no longer required
     $is_code_linenumbered = false;
   }
   elseif (preg_match('/<Table([^>]*).*?.*?>/', $line, $matches))
   {
     $attributes = $matches[1];
     $table_txt = '';
     $table_id = '';
     if (preg_match('/id="([^"]+)"/', $attributes, $matches))
     {
       $table_id = $matches[1];
     }
     else
     {
       $table_id = generateID('table');
     }
     $hidden = false;
     if (strpos($attributes, 'class="hidden"') !== false)
     {
       $hidden = true;
     }
     echo '[adding table: ' . $table_id . "] \n";
     $table_caption = getTitle($xml_in, 'table: ' . $table_id);
     if ($hidden)
     {
       $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
     }
     elseif (empty($table_caption))
     {
       $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
     }
     else
     {
       $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
     }
     // - in order to properly capture the table we're going to have to read in
     //   the whole thing here, and take note of column widths
     // - every read loop below also stops at end-of-file (getLine() returning
     //   false) so that a truncated table cannot hang the import
     $have_output_widths = false;
     $column_widths = array();
     while ($line !== false && strpos($line, '</Table>') === false)
     {
       // - find the start of a row; blank lines are skipped like any other
       //   line that is neither a row start nor the end of the table
       while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
       {
         $line = getLine($xml_in);
       }
       if ($line !== false && strpos($line, '<tr>') !== false)
       {
         $row_txt = '|';
         $line = getLine($xml_in);
         // - now we read in multiple cells (line starting <th
         while ($line !== false && strpos($line, '<th') === 0)
         {
           if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
           {
             $cell_width = $matches[1];
             // - widths are only collected from the first row of the table
             if (!$have_output_widths)
             {
               array_push($column_widths, $cell_width);
             }
           }
           // Ignore empty cells
           // - adding another case for empty header cells (turned up in es
           //   version of "From Paper")
           if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
           {
             $row_txt .= ' |';
           }
           else
           {
             $line = getLine($xml_in);
             $first = true;
             while ($line !== false && strpos($line, '</th>') === false)
             {
               // - separate multiple elements within one cell by a linebreak
               if (!$first)
               {
                 $row_txt .= '\\\\';
               }
               // - we can have images or text in our tables
               if (preg_match('/<File.*url="images\/([^"]+)".*\/>/', $line, $matches))
               {
                 $payload = $matches[0];
                 $filename = $matches[1];
                 $width = 0;
                 if (preg_match('/width="(\d+)"/', $payload, $matches))
                 {
                   $width = $matches[1];
                 }
                 $height = 0;
                 if (preg_match('/height="(\d+)"/', $payload, $matches))
                 {
                   $height = $matches[1];
                 }
                 $image_txt = handleImage($filename, $width, $height);
                 $row_txt .= ' ' . $image_txt . ' ';
               }
               elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
               {
                 $tid = $matches[1];
                 $txt = $matches[2];
                 // - multiple line text block (give up at end-of-file rather
                 //   than appending nothing forever)
                 $another_line = '';
                 while (strpos($txt, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
                 {
                   $txt .= $another_line;
                 }
                 $txt = str_replace('</Text>','',$txt);
                 $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
               }
               elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/',$line,$matches))
               {
                 $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
               }
               elseif (preg_match('/<CodeLine>(.*)/',$line,$matches))
               {
                 $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
               }
               elseif (preg_match('/(.*)<\/CodeLine>/',$line,$matches))
               {
                 if (!empty($matches[1]))
                 {
                   $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]). '\'\' ';
                 }
                 else
                 {
                   $row_txt .= ' ';
                 }
               }
               // we'll add (bogus) linebreaks
               // - this must append: assigning here would throw away every
               //   cell of the row gathered so far
               elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
               {
                 $row_txt .= ' ';
               }
               else
               {
                 printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
               }
               $first = false;
               // - next line
               $line = getLine($xml_in);
             }
             // - close the cell
             $row_txt .= '|';
           }
           // next!
           $line = getLine($xml_in);
         }
         // - if we haven't already, output the width command
         if (!$have_output_widths)
         {
           $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
           $have_output_widths = true;
         }
         $table_txt .= $row_txt . "\n";
         // - throw away the closing </tr>
         $line = getLine($xml_in);
       }
     }
     $table_txt .= "\n";
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $table_txt);
     }
     else
     {
       $frontmatter_text .= $table_txt;
     }
   }
   // - copy and insert images
   elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
   {
     $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $image_txt);
     }
     else
     {
       $frontmatter_text .= $image_txt;
     }
   }
   // - bullet lists
   elseif (preg_match('/<BulletList>/', $line))
   {
     echo "[adding bulletlist] \n";
     // - a list nested inside an open item starts on a new line
     if ($in_bullet_item || $in_numbered_item)
     {
       if ($in_chapter)
       {
         fwrite($chapter_txt_out, "\n");
       }
       elseif ($in_section)
       {
         $frontmatter_text .= "\n";
       }
     }
     $bullet_depth++;
     $is_numbered_list = false;
   }
   // - numbered lists
   elseif (preg_match('/<NumberedList>/', $line))
   {
     echo "[adding numbered list] \n";
     $bullet_depth++;
     $is_numbered_list = true;
     // - reset this flag that keeps track of whether an item (numbered or
     //   otherwise) is legitimately split by a code block
     $seen_code_in_item = false;
   }
   elseif (preg_match('/<\/BulletList>/', $line))
   {
     $bullet_depth--;
     // - leaving the outermost list: end it with a blank line
     if ($bullet_depth == 0)
     {
       if ($in_chapter)
       {
         fwrite($chapter_txt_out, "\n");
       }
       elseif ($in_section)
       {
         $frontmatter_text .= "\n";
       }
       $is_numbered_list = false;
     }
     ///cho "[finished bulletlist] ";
   }
   elseif (preg_match('/<\/NumberedList>/', $line))
   {
     $bullet_depth--;
     // - leaving the outermost list: end it with a blank line
     if ($bullet_depth == 0)
     {
       if ($in_chapter)
       {
         fwrite($chapter_txt_out, "\n");
       }
       elseif ($in_section)
       {
         $frontmatter_text .= "\n";
       }
       $is_numbered_list = false;
     }
     ///cho "[finished numbered list] ";
   }
   elseif (preg_match('/<NumberedItem>/', $line))
   {
     $in_numbered_item = 1;
   }
   elseif (preg_match('/<\/NumberedItem>/', $line))
   {
     $in_numbered_item = 0;
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, "\n");
     }
     else
     {
       $frontmatter_text .= "\n";
     }
   }
   elseif (preg_match('/<Bullet>/', $line))
   {
     $in_bullet_item = true;
   }
   elseif (preg_match('/<\/Bullet>/', $line))
   {
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, "\n");
     }
     else
     {
       $frontmatter_text .= "\n";
     }
     $in_bullet_item = false;
   }
   // TEXT HANDLING - this is the main case, but has disappeared into the mire
   // of other cases.
   elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
   {
     $id = $matches[1];
     $str = $matches[2];
     // - special case for those text elements split over multiple lines. We
     //   keep concatenating lines until we find the closing text element or we
     //   run out of lines!
     $another_line = '';
     while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
     {
       $str .= ' ' . $another_line;
     }
     // - note that if we ran out of lines (eof) then we'll break out of this
     //   block anyway, it's just there won't be a </Text> at the end of this
     //   block... despite this being a major validation issue in the XML it
     //   shouldn't result in this script being aborted
     // - now remove the </Text> from the end (hopefully) of str
     $str = preg_replace('/<\/Text>\s*/', '', $str);
     // - and prepend the id while translating the str into Dokuwiki format
     $str = '<!-- id:' . $id . ' -->' . translateText($str);
     if ($bullet_depth > 0)
     {
       if ($is_numbered_list)
       {
         // - special case for those text elements legitimately split in two by
         //   code blocks. They get no bullet of either type and are 'run-on'
         //   immediately to the end of the code element in order to prevent
         //   dokuwiki restarting numbering etc
         if ($seen_code_in_item)
         {
           // - leave str as it is
           // - reset flag just in case the item happens to contain another
           //   code block
           $seen_code_in_item = false;
         }
         else if ($in_numbered_item == 1)
         {
           $str = '- ' . $str;
         }
         // - superspecial case for the poorly formatted numberlists that
         //   contain more than one text block per point. We'll nest them
         //   as a bullet list as that preserves order, formatting and (I
         //   hope) meaning.
         else
         {
           if ($in_chapter)
           {
             fwrite($chapter_txt_out, "\n");
           }
           else
           {
             $frontmatter_text .= "\n";
           }
           $str = '  * ' . $str;
         }
         $in_numbered_item++;
       }
       else
       {
         $str = '* ' . $str;
       }
       // - indent by two spaces per level of list nesting
       for ($i = 0; $i < $bullet_depth; $i++)
       {
         $str = '  ' . $str;
       }
     }
     else
     {
       // Indented text is preceded by a >
       if ($in_indent)
       {
         $str = '> ' . $str . "\n";
       }
       else
       {
         $str .= "\n";
       }
     }
     // - plain paragraphs are followed by a blank line
     if ($bullet_depth == 0)
     {
       $str .= "\n";
     }
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $str);
     }
     else
     {
       $frontmatter_text .= $str;
     }
   }
   // - codified text blocks
   elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches))
   {
     $code_id = $matches[1];
     // - determine the appropriate code block prefix
     $code_prefix = '';
     if (!$in_code)
     {
       if ($is_code_linenumbered)
       {
         $code_prefix = '<code 1>';
       }
       else
       {
         $code_prefix = '<code>';
       }
       $in_code = true;
     }
     $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $code_txt);
     }
     else
     {
       $frontmatter_text .= $code_txt;
     }
   }
   // - the same again, but with the id and type attributes the other way around
   elseif (preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
   {
     $code_id = $matches[1];
     // - determine the appropriate code block prefix
     $code_prefix = '';
     if (!$in_code)
     {
       if ($is_code_linenumbered)
       {
         $code_prefix = '<code 1>';
       }
       else
       {
         $code_prefix = '<code>';
       }
       $in_code = true;
     }
     $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $code_txt);
     }
     else
     {
       $frontmatter_text .= $code_txt;
     }
   }
   elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
   {
     $code_txt = $matches[1];
     // - determine the appropriate code block prefix
     $code_prefix = '';
     if (!$in_code)
     {
       if ($is_code_linenumbered)
       {
         $code_prefix = "<code 1>\n";
       }
       else
       {
         $code_prefix = "<code>\n";
       }
       $in_code = true;
     }
     // - arg. another special case for codelines that span more than one line
     // (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
     // for an element name, eh?)
     $another_line = '';
     while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
     {
       $code_txt .= ' ' . $another_line;
     }
     $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
     $code_txt = $code_prefix . translateText($code_txt, true);
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $code_txt);
     }
     else
     {
       $frontmatter_text .= $code_txt;
     }
   }
   // - there are also sometimes empty codelines - which indicate a newline in
   //   the code listing
   elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
   {
     $code_txt = '';
     if (!$in_code)
     {
       $code_txt = "\n";
       if ($is_code_linenumbered)
       {
         $code_txt = "<code 1>\n" . $code_txt;
       }
       else
       {
         $code_txt = "<code>\n" . $code_txt;
       }
       $in_code = true;
     }
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $code_txt);
     }
     else
     {
       $frontmatter_text .= $code_txt;
     }
   }
   // - reference to an external XML file
   elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
   {
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, $line);
     }
     else
     {
       $frontmatter_text .= $line;
     }
   }
   elseif (strpos($line, '<FootnoteList>') !== false)
   {
     $in_footnotes = true;
   }
   elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
   {
     $in_footnotes = false;
   }
   // Indentation - the closest thing we have is quoting, so we'll use that
   elseif (strpos($line, '<Indented>') !== false)
   {
     $in_indent = true;
   }
   elseif (strpos($line, '</Indented>') !== false)
   {
     $in_indent = false;
   }
   // - pattern of lines to ignore
   else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
   {
   }
   // - we ignore anything else in footnotes too, as they were handled in the
   //   preprocessing pass
   else if ($in_footnotes)
   {
   }
   // - ignore empty lines
   else if (preg_match('/^\s*$/', $line))
   {
   }
   // - meh. French versions have random, non-text element, linebreaks floating
   //   around. Guess I'll honor their formatting even though it's bogus
   else if (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
   {
     if ($in_chapter)
     {
       fwrite($chapter_txt_out, ' \\\\');
     }
     else
     {
       $frontmatter_text .= ' \\\\';
     }
   }
   // - danger Will Robinson!
   else
   {
     // NOTE(review): $line_counter is never incremented in the visible code -
     // presumably getLine() maintains it; otherwise this always reports 0
     echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
   }
 }
// - the source file has been read completely, so release the handle
fclose($xml_in);

// 2. We should now have enough metadata to export the cover page
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
 {
   $top_page_backup_path = $top_page_path . '.bak';
   if(!rename($top_page_path, $top_page_backup_path))
   {
     printError('Failed to rename existing top page for backup');
   }
 }
// - and create a handle to the new file
$txt_out = fopen($top_page_path, 'w');
// - without a valid handle every fwrite() below would fail, so stop here
//   (same handling as for the chapter pages)
if (!$txt_out)
 {
   printError('Failed to open page file for writing: ' . $_REQUEST['m']);
 }
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
fwrite($txt_out, '  * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata, one table row per recorded value
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
foreach ($page_order as $page_info)
{
  // - each entry is already in 'page_name|title' form
  fwrite($txt_out, '  * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;

// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file
if (file_exists($frontmatter_page_path))
 {
   $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
   if(!rename($frontmatter_page_path, $frontmatter_page_backup_path))
   {
     printError('Failed to rename existing frontmatter page for backup');
   }
 }
// - populate the new frontmatter file; a failed write must not be reported
//   as a completed import
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
 {
   printError('Failed to write frontmatter page: ' . $sections_page_name);
 }

echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);

/**
 *  Record another value for a metadata field, announcing the addition on
 *  standard output. Values are kept, in the order they were added, in the
 *  global $manual_metadata array under the name of the field.
 *
 *  @param $field the name of the metadata field
 *  @param $value the value to append to the field's list of values
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - first value for this field: start a new list
  if (!isset($manual_metadata[$field]))
  {
    $manual_metadata[$field] = array();
  }
  $manual_metadata[$field][] = $value;
}
/** addMetadata() **/

/**
 *  Retrieve the first value recorded for a metadata field.
 *
 *  @param $field the name of the metadata field
 *  @return the first value stored for the field in the global
 *          $manual_metadata array, or the empty string when the field has
 *          no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  // - empty() covers both a field that was never recorded and a field
  //   recorded without any values
  if (empty($manual_metadata[$field]))
  {
    return '';
  }
  return $manual_metadata[$field][0];
}
/** getFirstMetadata() **/

/**
 *  Retrieve every value recorded for a metadata field.
 *
 *  @param $field the name of the metadata field
 *  @return the values stored for the field in the global $manual_metadata
 *          array, or an empty array when the field has not been recorded
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/

/**
 *  Read in the next title element with nested text element and extract the
 *  title.
 *
 *  Lines are consumed from $xml_in one at a time through getLine(), so the
 *  order of the reads below matters. The returned string consists of the
 *  title text (and subtitle text, when present), each piece prefixed by an
 *  '<!-- id:... -->' marker carrying the id of the Text element it came from.
 *  Malformed input is reported through printError().
 *
 *  @param $xml_in   the open handle the XML is being read from
 *  @param $element  only used to identify the element in error messages
 *  @return the extracted title, or an empty string for a '<Title/>' element
 */
function getTitle($xml_in, $element)
{
  $title = '';
  $in_title_element = false;
  // - the first thing in a chapter will be its title
  $title_line = getLine($xml_in);
  // - super special case: some language versions don't wrap titles in title
  //   element, so if the first thing we see is a text, we treat that as the
  //   title
  //   NOTE(review): this branch is empty and the needle is lower-case
  //   '<text' (strpos is case-sensitive), so the check has no effect; a bare
  //   '<Text id=...>' line is picked up by the Text match further down
  if (strpos($title_line, '<text') !== false)
  {

  }
  // - super special case: a table with an empty title
  if (strpos($title_line, '<Title/>') !== false)
  {
    return '';
  }
  // - an opening Title element: remember it so the closing tag is checked
  //   later, and move on to the line after it
  if (strpos($title_line, '<Title>') !== false)
  {
    $in_title_element = true;
    $title_line = getLine($xml_in);
  }
  // - some horribly formed entries have the subtitle first within the title
  //   element
  if (strpos($title_line, '<SubTitle>') !== false)
  {
    $title_line = getLine($xml_in);
    // - $title is still empty here, so this stores the subtitle text followed
    //   by a space; the main title gets put in front of it below
    if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
    {
      $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
    }
    $title_line = getLine($xml_in);
    if (strpos($title_line, '</SubTitle>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
    // - step onto the line expected to hold the main title text
    $title_line = getLine($xml_in);
  }
  // - grab the chapter title now so we can store it in the page ordering
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $title_line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for text blocks that span multiple lines (as discovered
    //   in the russian "From Paper"): keep appending lines, separated by a
    //   space, until the closing Text tag shows up or the input runs out
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - now remove </Text> (and any whitespace following it)
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    $title = '<!-- id:' . $id . ' -->' . $str . $title;
  }
  // - special case for empty titles that still use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $title_line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }
  // - watch for subtitle elements
  if ($in_title_element)
  {
    $title_line = getLine($xml_in);
    if (strpos($title_line, '<SubTitle>') !== false)
    {
      $title_line = getLine($xml_in);
      // - a subtitle found after the main title is placed in front of it,
      //   separated by a space
      if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
      {
        $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
      }
      $title_line = getLine($xml_in);
      if (strpos($title_line, '</SubTitle>') === false)
      {
        printError('Failed to find closing title for: ' . $element);
      }
      $title_line = getLine($xml_in);
    }
    // - whatever line we are on now must close the Title element
    if (strpos($title_line, '</Title>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
  }
  return $title;
}
/** getTitle() **/

/**
 *  Rewrite HTML comment delimiters into their %-based stand-ins ('%!--' and
 *  '--%') and reduce any CrossRef element to the value of its ref attribute.
 *
 *  @param $text  the text to rewrite
 *  @return the rewritten text
 */
function alternateComments($text)
{
  // - swap the opening delimiter first, then the closing one
  $escaped = str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
  // - remove any lurking crossrefs while we are at it
  return preg_replace('/<CrossRef.*?ref="([^"]+)".*?>/', '\\1', $escaped);
}

/**
 *  Turn <i>...</i> spans into dokuwiki italics (//...//) and strip HTML
 *  comments from the text.
 *
 *  @param $text  the text to clean
 *  @return the text with italics converted and comments removed
 */
function noComments($text)
{
  // - italics first
  $italicised = preg_replace('/<i>(.*?)<\/i>/','//\1//',$text);
  // - then drop every comment that contains no '>' between its delimiters
  $without_comments = preg_replace('/<!--[^>]+-->/', '', $italicised);
  return $without_comments;
}

/**
 *  Translate a code line found inside a table: entities are decoded through
 *  translateText() in entities-only mode, while <i> and </i> tags come out as
 *  the HTML comments <!--i--> and <!--/i-->.
 *
 *  @param $text  the raw code line
 *  @return the translated line
 */
function translateTableCodeline($text)
{
  ///cho "<b>Debug:</b> translateTableCodeLine('" . htmlspecialchars($text) . "')<br />\n";
  // - park the italic tags behind placeholders so the translation leaves
  //   them alone
  $protected = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // - decode the entities only
  $decoded = translateText($protected, true);
  // - bring the italic tags back as HTML comments: remembered, but hidden in
  //   the text
  $result = str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $decoded);
  ///cho " =&gt; '" . htmlspecialchars($text) . "<br />\n";
  return $result;
}

/**
 *  Convert a fragment of manual XML text into dokuwiki markup.
 *
 *  Unless $entities_only is true, cross references, links, footnote
 *  references and the inline b/i/u tags are rewritten into their dokuwiki
 *  equivalents. In every case the user defined entities (taken from the
 *  global $entity_replacements) and the standard gt/lt/amp entities are
 *  decoded. Encoded HTML comment delimiters are protected first, unless the
 *  global $in_code flag is set.
 *
 *  Malformed external references and unknown footnote ids are reported
 *  through printError().
 *
 *  @param $text           the raw text to translate
 *  @param $entities_only  when true, skip the markup rewriting and only
 *                         protect comments and decode entities
 *  @return the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text
  //   (so already using encoded entities), otherwise they'll be thoroughly
  //   mangled during the following transforms. This has to be matched with
  //   changes to the HTMLComments plugin in Dokuwiki to allow the correct
  //   thing to be displayed to the user.
  if (!$in_code)
  {
    $text = str_replace('&lt;!--', '%!--', $text);
    $text = str_replace('--&gt;', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter section crossrefs are easily the worst of all...
    //   NOTE(review): the third preg_match picks up the first CrossRef in the
    //   text, which is not necessarily the one that satisfied the first two
    //   tests - confirm that mixed references cannot share a line
    while (preg_match('/<CrossRef\s[^>]*external[^>]*\/>/', $text) && preg_match('/<CrossRef\s[^>]*target="Chapter"[^>]*\/>/', $text) && preg_match('/<CrossRef\s+(.*?)\/>/', $text, $matches))
    {
      $pattern = $matches[0];
      $attributes = $matches[1];
      $manual_name = '';
      if (preg_match('/external="([^"]+)"/', $attributes, $matches))
      {
        $manual_name = $matches[1];
      }
      $language = '';
      if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
      {
        $language = $matches[1];
      }
      $page_id = '';
      if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
      {
        $page_id = $matches[1];
      }
      if (empty($manual_name) || empty($language) || empty($page_id))
      {
        printError('Failed to parse external reference: ' . $pattern);
      }
      // - best we can do is a search within a restricted namespace
      $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
      $text = str_replace($pattern, $reference, $text);
    }
    // - chapter crossrefs are tricksie due to needing to know ordering numbers
    while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
    {
      $chapter_id = $matches[1];
      $page_name = $chapter_id;
      // - replace the matched tag literally: building a second regex from the
      //   id would misbehave (or never match, looping forever) as soon as the
      //   id contained a regex metacharacter
      $text = str_replace($matches[0], '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
    }
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references are also tricksie as we've had to extract the
    //   footnotes earlier (during chapter counting)
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      if (!isset($footnotes[$footnote_id]))
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $footnote = $footnotes[$footnote_id];
      // - insert the footnote text literally: as a preg_replace replacement
      //   any '$' or '\' followed by digits inside the footnote would be
      //   treated as a backreference and silently dropped
      $text = str_replace($matches[0], '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> tags very carefully (as doku
    //   will less than helpfully turn them into an external link and screw up
    //   everything that follows them on the page).
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - superspecial case for two-part URLs ending in .org (like nzdl.org)
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - another superspecial case, this time for URLs on localhost
    $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - underlines have the same issue around URLs.
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);

    // - replace HTML elements with the dokuwiki style equivalents
    // - wrap the double slash of a protocol in %% so that it is not read as
    //   the start of dokuwiki italics
    $text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links (each pass strips one
    //   %% from inside every [[...]] link, so repeat until none remain)
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    //$text = str_replace(' <br/>', '\\\\ ', $text);
    //$text = str_replace(' <br/>', '\\\\ ', $text);
    //$text = str_replace('<br/>', '\\\\ ', $text);
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata): each name maps to a
  //   numeric character code which is decoded to its UTF-8 character
  foreach ($entity_replacements as $entity=>$code)
  {
    $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
  }
  // - standard entities (amp last, so that a doubly encoded entity is only
  //   decoded one level)
  $text = str_replace('&gt;','>', $text);
  $text = str_replace('&lt;','<', $text);
  $text = str_replace('&amp;','&', $text);
  return $text;
}
/** translateText() **/

/**
 *  Copy a manual image into the dokuwiki media tree and build the wiki markup
 *  that embeds it.
 *
 *  The source is read from the 'images' directory of the current language
 *  ($_REQUEST['l']) below the global $xml_source_path, and is written, with a
 *  lower-cased filename, below the global $dokuwiki_path. The destination
 *  directory is created through mkAllDir() when missing. A failed copy is
 *  reported through printError().
 *
 *  @param $filename  the image filename as referenced in the XML
 *  @param $width     the display width to request
 *  @param $height    the display height to request
 *  @return the dokuwiki image markup referencing the copied file
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
  if (!file_exists($destination_dir))
  {
    mkAllDir($destination_dir, 0755);
  }
  $wiki_filename = strtolower($filename);
  $destination_path = $destination_dir . $wiki_filename;
  // - check the result of copy() itself: testing only for the destination
  //   file would let a stale copy from an earlier run hide a failure
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  return '{{..:images:' . $wiki_filename . '?' . $width . 'x' . $height . '&direct}}';
}

/**
 *  Read the next line from an open handle, bumping the global $line_counter
 *  on every call (including calls that hit the end of the input).
 *
 *  @param $in  the open handle to read from
 *  @return whatever fgets() returns: the line, or false when nothing is left
 */
function getLine($in)
{
  global $line_counter;
  $next_line = fgets($in);
  ++$line_counter;
  return $next_line;
}

/**
 *  Strip the inline formatting tags <b>, </b>, <i>, </i> and <br/> from the
 *  text, leaving their contents in place.
 *
 *  @param $text  the text to strip
 *  @return the text without those tags
 */
function noFormatting($text)
{
  // - removed one after another, in this order
  $formatting_tags = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($formatting_tags, '', $text);
}

?>