<?php

require_once('common.php');
$debug = 1;

// There are some elements that are, in my opinion, incorrectly 'resolved' in
// code blocks just because they are valid HTML. For instance, &lt;i&gt; is
// resolved to <i> in code blocks, while something like &lt;Metadata&gt; is
// not. Set this to true to allow such abominations (for the purpose of
// comparing before and after versions of the XML). [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 *  This script transforms the series of dokuwiki pages that make up a certain
 *  manual (as specified by the 'm' argument) in a certain language ('l') into
 *  the XML format required by the rest of the Greenstone manual generation
 *  scripts.
 *
 *  Request arguments read by this script:
 *   - 'l' language code (default 'en')
 *   - 'm' manual name (default 'develop')
 *   - 'v' version; 'draft' uses the latest page revisions (default 'draft')
 *   - 'a' action, either 'download' or 'store' (default 'store')
 */

// 0. Initialization

// - we have a counter to assign identifiers to text blocks etc without ids
$text_id_counter = 1;
// - we need an array of all the footnotes (footnote text => footnote id)
$footnotes = array();

// Defaults
// - an argument falls back to its default when it is missing, empty, or not a
//   plain string (a query such as 'm[]=x' arrives as an array, which the
//   pattern matching and string concatenation further down cannot handle)
// - other manuals that can be used as the default: 'user', 'install', 'paper'
$request_defaults = array('l' => 'en', 'm' => 'develop', 'v' => 'draft');
foreach ($request_defaults as $request_key => $request_default)
{
  if (empty($_REQUEST[$request_key]) || !is_string($_REQUEST[$request_key]))
  {
    $_REQUEST[$request_key] = $request_default;
  }
}
// - the action must be exactly 'download' or 'store'; the D modifier stops '$'
//   from also matching just before a trailing newline
if (!isset($_REQUEST['a']) || !is_string($_REQUEST['a']) || !preg_match('/^(download|store)$/D', $_REQUEST['a']))
{
  $_REQUEST['a'] = 'store'; // Try to store the file to disk
}

// NOTE(review): $base_path, filecat() and recursiveRemove() are not defined in
// this file - presumably they come from common.php; confirm.
$var_path = filecat(array($base_path, 'var'));
$timestamp = time();
//$xml_source_path = '**PATH TO GSDL MANUALS**'
$xml_source_path = '/tmp';
if ($_REQUEST['a'] == 'download')
{
  // Clear out previous exports
  recursiveRemove($var_path, '/greenstone/greenstone-documentation/php/var');
  // New export goes into a directory named after the current timestamp
  $xml_source_path = fileCat(array($var_path, $timestamp));
}

echo '<html>' . "\n";
echo '<head>' . "\n";
echo '<title>GS Manual Export</title>' . "\n";
echo '</head>' . "\n";
echo '<body>' . "\n";

// - validate arguments before we use them (security): both values end up in
//   file system paths and in the generated XML
// - anything that is not a string (e.g. 'm[]=x' arrives as an array) is
//   treated as an empty string so it is rejected by the whitelist rather than
//   raising a type error
// - the D modifier makes '$' match only at the very end of the value; without
//   it a value such as "user\n" would pass the whitelist
$requested_manual = (isset($_REQUEST['m']) && is_string($_REQUEST['m'])) ? $_REQUEST['m'] : '';
if (!preg_match('/^(develop|install|paper|user)$/D', $requested_manual))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($requested_manual));
}

$requested_language = (isset($_REQUEST['l']) && is_string($_REQUEST['l'])) ? $_REQUEST['l'] : '';
if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $requested_language))
{
  printError('Unknown language requested: ' . htmlspecialchars($requested_language));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
// - escaping is a no-op for whitelisted values, but keeps this line safe even
//   if the whitelist above is ever loosened
echo '<p><b>Manual:</b> ' . htmlspecialchars($requested_manual) . ' <b>Language:</b> ' . htmlspecialchars($requested_language) . "</p>\n<hr/>\n";
// 1. Create the XML output file handle
// - construct the path using the information we've been provided as arguments:
//   <xml source path>/<language>/<Manual>_<language>.xml
$xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
mkAllDir($xml_file_dir);
$xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
// - backup any existing file (replacing any earlier .bak)
if (file_exists($xml_file_path))
{
  $xml_backup_file_path = $xml_file_path . '.bak';
  if (!rename($xml_file_path, $xml_backup_file_path))
  {
    printError('Failed to rename existing manual file for backup');
  }
}

// - and create a handle to the new file
$xml_out = fopen($xml_file_path, 'w');
// - every later step writes through this handle, so report a failure here in
//   the same way as the other setup errors instead of writing to an invalid
//   handle
if (!$xml_out)
{
  printError('Failed to open manual file for writing');
}

// 2. Read in the top level page - it supplies the cover page metadata for the
//    manual and lists, in order, the other pages that make up the manual
echo "<p><b>Frontmatter:</b><br/>\n";
// - filled in while scanning the page: metadata field => list of values, and
//   the page names found under the 'Contents' heading
$cover_metadata = array();
$pages_in_order = array();
// - a draft export uses the latest revision of each page regardless of
//   approval; otherwise only approved revisions are used. Approval only applies
//   to the english manual, as those are the only editable pages, so every
//   other language always reads the latest revision
$use_latest_revision = ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en');
$top_page_path = $use_latest_revision
  ? $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt'
  : getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);

if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - the page is scanned line-by-line, picking out the lines of interest by
//   pattern matching
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
// - a bulletpoint item that links to a page of this manual
$page_link_pattern = '/^\s+\*\s+\[\[.:' . $_REQUEST['m'] . ':(.+?)\|(.*?)\]\]\s*$/';
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - while inside the 'Contents' section every page link adds the linked
  //   page name to the page order
  if ($in_contents && preg_match($page_link_pattern, $line, $matches))
  {
    $pages_in_order[] = $matches[1];
    continue;
  }
  // - metadata is all encoded within dokuwiki tables ('^ Field | value |');
  //   a field may repeat, so values accumulate in a list per field
  if (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $cover_metadata[$matches[1]][] = $matches[2];
    continue;
  }
  // - a heading switches page order capture on ('Contents') or off (any
  //   other title)
  if (preg_match('/^=+\s(.+)\s=+$/', $line, $matches))
  {
    $in_contents = ($matches[1] == 'Contents');
  }
}
// - fgets() returning false before the end of the file is a read failure
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (isset($cover_metadata[$required_field]))
  {
    continue;
  }
  printError('Missing required metadata: ' . $required_field);
}
// - now the metadata is used to build the XML header and the cover page. The
//   header is the XML declaration, a DOCTYPE carrying one <!ENTITY ...>
//   declaration per 'ENTITY' metadata value, and the opening <Manual> element
$xml_header = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml_header .= '<!DOCTYPE Manual [' . "\n";
if (isset($cover_metadata['ENTITY']))
{
  foreach ($cover_metadata['ENTITY'] as $entity)
  {
    $xml_header .= "\t" . '<!ENTITY ' . $entity . '>' . "\n";
  }
}
$xml_header .= ']>' . "\n";
$xml_header .= '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">' . "\n";
fwrite($xml_out, $xml_header);

// - the cover page elements follow a fixed order: single valued fields, then
//   the fields that may repeat, then the remaining single valued fields
foreach (array('Heading', 'Title', 'Author', 'Affiliation') as $cover_field)
{
  outputMetadataSingle($xml_out, $cover_metadata, $cover_field);
}
foreach (array('SupplementaryText', 'Text', 'Comment') as $cover_field)
{
  outputMetadataMultiple($xml_out, $cover_metadata, $cover_field);
}
foreach (array('Version', 'Date') as $cover_field)
{
  outputMetadataSingle($xml_out, $cover_metadata, $cover_field);
}

// 3. Export every page named in the contents of the top level page, keeping
//    the order in which they were listed
foreach ($pages_in_order as $page)
{
  processPage($xml_out, $page);
}

// 4. Output our list of footnotes (if any) - $footnotes maps each footnote
//    text to the id used by its references
if (count($footnotes) > 0)
{
  fwrite($xml_out, '<FootnoteList>'. "\n");
  foreach ($footnotes as $footnote_text => $footnote_id)
  {
    outputMetadataSingle($xml_out, $footnote_text, 'Footnote', $footnote_id);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Close the root element, then the file, and set the file mode to 0664
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);
chmod($xml_file_path, 0664);

// 6. Complete!
echo '<p><b>Complete!</b></p>' . "\n<hr/>\n";
if ($_REQUEST['a'] == 'download')
{
  // Zip up the manual files - both archives are written next to (not inside)
  // the language directory that gets archived
  $archive_basename = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'];
  $zip_file = $archive_basename . '.zip';
  $zip_path = fileCat(array($xml_source_path, $zip_file));
  // - escapeshellarg() quotes the path so that no character in it can be
  //   interpreted by the shell
  $zip_command = 'zip -r ' . escapeshellarg($zip_path) . ' . > /dev/null 2>&1';
  $tgz_file = $archive_basename . '.tgz';
  $tgz_path = fileCat(array($xml_source_path, $tgz_file));
  $tgz_command = 'tar -czf ' . escapeshellarg($tgz_path) . ' * > /dev/null 2>&1';
  // We need to move to the document folder so that archives have sensible paths
  $original_cwd = getcwd();
  // - both commands archive "whatever is in the current directory", so they
  //   must only run once we really are inside the export directory
  if (chdir($xml_file_dir))
  {
    system($zip_command);
    system($tgz_command);
    // Go back
    if ($original_cwd !== false)
    {
      chdir($original_cwd);
    }
    // Links are ready
    echo '<p>Download XML file plus images as: <a href="var/' . $timestamp . '/' . $zip_file . '">ZIP</a> or <a href="var/' . $timestamp . '/' . $tgz_file . '">TGZ</a></p>' . "\n";
  }
  else
  {
    // - passing false mirrors the call in processPage(), which carries on
    //   after reporting the error
    printError('Failed to enter export directory, no archives were created', false);
  }
}
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to dokuwiki</p>' . "\n";
echo '</body>' . "\n";
echo '</html>';
exit(0);

/**
 * Write one <$field> element to the XML output containing at most a single
 * text block, and echo a progress marker to the page.
 *
 * @param resource $xml_out  handle the XML is written to
 * @param mixed    $metadata either an array of field => list of values, of
 *                           which only the first value of $field is used, or
 *                           a non-array value that is itself the text
 * @param string   $field    name of the element (and of the array key read)
 * @param mixed    $mid      when truthy, written as the element's id attribute
 *
 * When there is nothing to write the element is left empty and a notice is
 * echoed instead.
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";
  $id_attribute = $mid ? ' id="' . $mid . '"' : '';
  fwrite($xml_out, '<' . $field . $id_attribute . '>' . "\n");

  // - pick the text to output; null stands for 'nothing available'
  if (is_array($metadata))
  {
    $text = isset($metadata[$field][0]) ? $metadata[$field][0] : null;
  }
  else
  {
    $text = empty($metadata) ? null : $metadata;
  }

  if ($text === null)
  {
    echo 'no such field or no metadata';
  }
  else
  {
    outputTextBlock($xml_out, $text);
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/

/**
 * Write one <$field> element whose single text block is all values of the
 * field joined into one string.
 *
 * @param resource $xml_out         handle the XML is written to
 * @param array    $metadata        field => list of values
 * @param string   $field           name of the element and of the key read
 * @param string   $separator       placed between values
 * @param mixed    $final_separator when truthy, used instead of $separator
 *                                  before the last value
 *
 * A field with exactly one value is written as is; a field that is set but
 * has no values produces an empty element.
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    $entries = $metadata[$field];
    $entry_count = count($entries);
    if ($entry_count == 1)
    {
      outputTextBlock($xml_out, $entries[0]);
    }
    elseif ($entry_count > 1)
    {
      if ($final_separator)
      {
        // - hold the last value back so it can be attached with its own
        //   separator
        $tail = array_pop($entries);
        $joined = implode($separator, $entries) . $final_separator . $tail;
      }
      else
      {
        $joined = implode($separator, $entries);
      }
      outputTextBlock($xml_out, $joined);
    }
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/

/**
 * Write every value of a metadata field as its own text block, wrapped in a
 * <$field> element unless the field is 'Text'.
 *
 * @param resource $xml_out  handle the XML is written to
 * @param array    $metadata field => list of values
 * @param string   $field    name of the wrapping element and of the key read
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";
  // - text blocks already come out as Text elements, so the 'Text' field
  //   gets no wrapper of its own
  $needs_wrapper = ($field != 'Text');
  if ($needs_wrapper)
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    foreach ($metadata[$field] as $value)
    {
      outputTextBlock($xml_out, $value);
    }
  }
  if ($needs_wrapper)
  {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}

/**
 * Encode a line of code for XML while keeping italics that were hidden in
 * HTML comments: <!--i--> and <!--/i--> come out as <i> and </i>.
 *
 * @param string $text the raw line
 * @return string the line encoded by translateText() in code block mode
 */
function translateTableCodeline($text)
{
  // - swap the comment-wrapped italic markers for placeholders that contain
  //   no characters translateText() would encode
  $placeholders = array('%!--i--%', '%!--/i--%');
  $text = str_replace(array('<!--i-->', '<!--/i-->'), $placeholders, $text);
  // - encode entities etc
  $text = translateText($text, true);
  // - turn the placeholders into real italics elements
  return str_replace($placeholders, array('<i>', '</i>'), $text);
}
/** translateTableCodeline() **/

/**
 * Encode the XML special characters &, < and > in a piece of text.
 *
 * @param string $text          the raw text
 * @param bool   $in_code_block when true, and the global $allow_bad_codeblocks
 *                              is also true, encoded <i>, </i> and <br/> tags
 *                              are turned back into real tags
 * @return string the encoded text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - the pairs are applied in order, so ampersands are encoded before the
  //   entities for < and > introduce new ones
  $text = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
  if (!$in_code_block || !$allow_bad_codeblocks)
  {
    return $text;
  }
  return str_replace(
    array('&lt;i&gt;', '&lt;/i&gt;', '&lt;br/&gt;'),
    array('<i>', '</i>', '<br/>'),
    $text
  );
}

/**
 * Convert one block of dokuwiki markup into a <Text> element and write it to
 * the XML output.
 *
 * The conversion collects footnotes, determines the id of the block, encodes
 * XML special characters, and rewrites links, references, italic, bold and
 * underline markup into their element form.
 *
 * Globals:
 *  - $cover_metadata       read; its 'ENTITY' values name the entities that
 *                          may stay unencoded
 *  - $text_id_counter      read and advanced; supplies ids for blocks that
 *                          carry no explicit id
 *  - $footnotes            read and extended; maps footnote text => id
 *  - $allow_bad_codeblocks read; see the in-code-block decoding below
 *
 * @param resource $xml_out       handle the XML is written to
 * @param string   $text          the dokuwiki markup; may start with an
 *                                explicit id in the form <!-- id:... -->
 * @param string   $type          when not empty, written as the type attribute
 * @param bool     $in_code_block true when the text belongs to a code block,
 *                                which skips the general encoding of &, < and >
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches))
  {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote text that was seen before keeps its id. Handing out a
    //   fresh id would overwrite the stored one (leaving earlier references
    //   pointing at nothing) and, because the number of footnotes would not
    //   grow, the next new footnote would be given the very same id
    if (isset($footnotes[$footnote]))
    {
      $footnote_id = $footnotes[$footnote];
    }
    else
    {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches))
  {
    $text_id = $matches[1];
    $text = $matches[2];
    // - a numeric explicit id also moves the counter, so generated ids carry
    //   on from it
    if (is_numeric($text_id))
    {
      $text_id_counter = $text_id + 1;
    }
  }
  else
  {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text (only comments whose body consists solely of whitespace and
  //   full stops match this pattern)
  $text = preg_replace('/<!--([\s\.]+?)-->/','##lt##!--\1--##gt##',$text);

  // - every other HTML comment is dropped from the text
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block)
  {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY']))
    {
      foreach ($cover_metadata['ENTITY'] as $entity)
      {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
        {
          $entity_name = $matches[1];
          // - the mdash entity is deliberately left out of both conversions
          if ($entity_name != 'mdash')
          {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char into
            //   the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code')
  {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internals have to become the horrible <CrossRef> tags. We ignore any
  //   number prefix on the page name as that is just used for ordering within
  //   Dokuwiki
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page; the longest
  //   hash prefix is tried first so that '#' does not swallow '##' and '###'
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know what
  //   the page order number for another manual's chapters might be, we instead
  //   use a search
  $text = preg_replace('/\[\[\?do\=search\&amp;id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);
  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect // in protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid HTML,
  //   derp).
  if ($in_code_block && $allow_bad_codeblocks)
  {
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
    //$text = str_replace('&lt;br/&gt;','<br/>',$text);
  }
  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '&lt;!--', $text);
  $text = str_replace('--%', '--&gt;', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);
  // output the text block
  $text = trim($text);
  // - compare against the empty string explicitly: empty() also counts the
  //   text '0' as empty, which would silently drop it from the manual
  if ($text === '')
  {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type))
  {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else
  {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/

/**
 */
function processPage($xml_out, $page_name)
{
  global $dokuwiki_path;
  global $seen_ids;
  echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
  // - locate the page in question (taking into account if the user asked for a
  //   draft version or an approved version of the manual)
  $page_path = '';
  if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
  {
    $page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
  }
  else
  {
    $page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
  }
  $page_in = @fopen($page_path, 'r');
  if (!$page_in)
  {
    printError('Failed to open page for reading:' . $page_name, false);
    return;
  }
  // - once again we read in line-by-line, but this time we are going to output
  //   each line as we go through. We expect to encounter certain lines in a
  //   predefined order, and should complain if we don't find what we expect.
  $in_chapter = false;
  $in_section = false;
  $in_subsection = false;
  $in_part = false;
  $in_list = false;
  $lists = array();
  $previous_listitem_type = '';
  $in_figure = false;
  $in_table = false;
  $column_widths = array();
  $in_code_block = false;
  while (($line = fgets($page_in)) !== false)
  {
    // remove newline character
    $line = preg_replace('/\r?\n$/','',$line);
    // - we need to know the 'depth' for the bulletpoint lists
    $depth = 0;
    while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == '  ')
    {
      $depth++;
      $line = substr($line, 2);
    }
    $first_character = substr($line, 0, 1);
    // - special case for the end of bullet lists
    if ($in_list && ($first_character != "*" && $first_character != "-"))
    {
      while (count($lists) > 0)
      {
        $list_type = array_pop($lists);
        if ($list_type == '*')
        {
          fwrite($xml_out, '</Bullet>' . "\n");
          fwrite($xml_out, '</BulletList>' . "\n");
        }
        else
        {
          fwrite($xml_out, '</NumberedItem>' . "\n");
          fwrite($xml_out, '</NumberedList>' . "\n");
        }
      }
      $in_list = false;
    }
    // - special case for the end of tables
    if ($in_table && $first_character != '^' && $first_character != '|')
    {
      fwrite($xml_out, '</TableContent>' . "\n");
      fwrite($xml_out, '</Table>' . "\n");
      $in_table = false;
    }
    // - special cases for premature closing of sections, subsections and parts
    if (preg_match('/<!-- close:(section|subsection|part) -->/', $line, $matches))
    {
      // - we always try to do this (regardless of actual flag) as we must
      //   always close the smallest 'granularity' first
      if ($in_part)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Part>' . "\n");
        $in_part = false;
      }
      if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection'))
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Subsection>' . "\n");
        $in_subsection = false;
      }
      if ($in_section && $matches[1] == 'section')
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Section>' . "\n");
        $in_section = false;
      }
    }

    // - if this page is a chapter, then the first thing on the page should be
    //   the chapter title (six equals)
    if (preg_match('/====== (.+) ======/', $line, $matches))
    {
      $chapter_title = $matches[1];
      $chapter_id = $page_name;
      if (empty($chapter_id))
      {
        $chapter_id = generateID($chapter_title);
      }
      // - are we already processing a part? if so end it, end it now
      if ($in_part)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Part>' . "\n");
        $in_part = false;
      }
      // - are we already processing a subsection? if so end it, end it now
      if ($in_subsection)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Subsection>' . "\n");
        $in_subsection = false;
      }
      // - are we already processing a section? if so end it, end it now
      if ($in_section)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Section>' . "\n");
        $in_section = false;
      }
      // - are we already processing a chapter? if so end it, end it now
      if ($in_chapter)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Chapter>' . "\n");
        $in_chapter = false;
      }
      // - write out this chapter's header
      fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n");
      outputMetadataSingle($xml_out, $chapter_title, 'Title');
      fwrite($xml_out, '<Content>' . "\n");
      $in_chapter = true;
    }
    // - the next likely thing to encounter is a section heading (five equals)
    elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches))
    {
      $section_title = $matches[1];
      // - check for explicit section id
      $section_id = '';
      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches))
      {
        $section_id = $matches[1];
        $section_title = $matches[2];
      }
      if (empty($section_id))
      {
        $section_id = generateID($section_title);
      }
      // - are we already processing a part? if so end it, end it now
      if ($in_part)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Part>' . "\n");
        $in_part = false;
      }
      // - are we already processing a subsection? if so end it, end it now
      if ($in_subsection)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Subsection>' . "\n");
        $in_subsection = false;
      }
      // - are we already processing a section? if so end it, end it now
      if ($in_section)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Section>' . "\n");
        $in_section = false;
      }
      // - write out this section's header
      fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n");
      outputMetadataSingle($xml_out, $section_title, 'Title');
      fwrite($xml_out, '<Content>' . "\n");
      $in_section = true;
    }
    // - similar for subsection heading (four equals)
    elseif (preg_match('/==== (.+) ====/', $line, $matches))
    {
      $subsection_title = $matches[1];
      // - check for explicit subsection id
      $subsection_id = '';
      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches))
      {
        $subsection_id = $matches[1];
        $subsection_title = $matches[2];
      }
      if (empty($subsection_id))
      {
        $subsection_id = generateID($subsection_title);
      }
      // - are we already processing a part? if so end it, end it now
      if ($in_part)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Part>' . "\n");
        $in_part = false;
      }
      // - are we already processing a subsection? if so end it, end it now
      if ($in_subsection)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Subsection>' . "\n");
        $in_subsection = false;
      }
      // - write out this subsection's header
      fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n");
      outputMetadataSingle($xml_out, $subsection_title, 'Title');
      fwrite($xml_out, '<Content>' . "\n");
      $in_subsection = true;
    }
    // - and part heading (three equals)
    elseif (preg_match('/=== (.+) ===/', $line, $matches))
    {
      $part_title = $matches[1];
      // - check for explicit part id
      $part_id = '';
      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $part_title, $matches))
      {
        $part_id = $matches[1];
        $part_title = $matches[2];
      }
      if (empty($part_id))
      {
        $part_id = generateID($part_title);
      }
      // - are we already processing a part? if so end it, end it now
      if ($in_part)
      {
        fwrite($xml_out, '</Content>' . "\n");
        fwrite($xml_out, '</Part>' . "\n");
        $in_part = false;
      }
      // - write out this part's header
      fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n");
      outputMetadataSingle($xml_out, '**//' . $part_title . '//**', 'Title');
      fwrite($xml_out, '<Content>' . "\n");
      $in_part = true;
    }
    // - Ignore 5th level heading - they are only used to allow more convenient
    //   editing of figures and tables
    elseif (preg_match('/== (.+) ==/', $line, $matches))
    {
    }
    // - lists need special handling
    elseif (preg_match('/^(\*|\-)\s+(.*)/', $line, $matches))
    {
      $list_type = $matches[1];
      $list_text = $matches[2];
      $list_depth = count($lists);
      if (!$in_list)
      {
        if ($list_type == '*')
        {
          fwrite($xml_out, '<BulletList>' . "\n");
        }
        else
        {
          fwrite($xml_out, '<NumberedList>' . "\n");
        }
        $in_list = true;
        array_push($lists, $list_type);
      }
      // - this bullet is at the same depth as previous - close the previous
      //   point
      elseif ($depth == $list_depth)
      {
        $previous_list_type = end($lists);
        if ($previous_list_type == '*')
        {
          fwrite($xml_out, '</Bullet>' . "\n");
        }
        else
        {
          fwrite($xml_out, '</NumberedItem>' . "\n");
        }
        // - we don't match in type anymore... close the previous list and open
        //   a new list of the appropriate type
        if ($list_type != $previous_list_type)
        {
          if ($previous_list_type == '*')
          {
            fwrite($xml_out, '</BulletList>' . "\n");
            fwrite($xml_out, '<NumberedList>' . "\n");
          }
          else
          {
            fwrite($xml_out, '</NumberedNumbered>' . "\n");
            fwrite($xml_out, '<BulletList>' . "\n");
          }
          array_pop($lists);
          array_push($lists, $list_type);
        }
      }
      else
      {
        // - we have either got deeper...
        if ($depth > $list_depth)
        {
          if ($list_type == '*')
          {
            fwrite($xml_out, '<BulletList>' . "\n");
          }
          else
          {
            fwrite($xml_out, '<NumberedList>' . "\n");
          }
          array_push($lists, $list_type);
        }
        // ... or shallower in the bullet listing
        if ($depth < $list_depth)
        {
          $previous_list_type = array_pop($lists);
          if ($previous_list_type == '*')
          {
            fwrite($xml_out, '</Bullet>' . "\n");
            fwrite($xml_out, '</BulletList>' . "\n");
          }
          else
          {
            fwrite($xml_out, '</NumberedItem>' . "\n");
            fwrite($xml_out, '</NumberedList>' . "\n");
          }
          // - we still have to close the last item too
          $previous_listitem_type = end($lists);
          if ($previous_listitem_type == '*')
          {
            fwrite($xml_out, '</Bullet>' . "\n");
          }
          else
          {
            fwrite($xml_out, '</NumberedItem>' . "\n");
          }
        }
      }
      if ($list_type == '*')
      {
        fwrite($xml_out, '<Bullet>' . "\n");
      }
      else
      {
        fwrite($xml_out, '<NumberedItem>' . "\n");
      }
      // Special Case: bullets that contain (start) a code block
      if (preg_match('/^(.*)<code>\s*$/', $list_text, $matches))
      {
        $list_text = $matches[1];
        $in_code_block = true;
      }

      outputTextBlock($xml_out, $list_text);

      // - to make things clearer, we'll process any and all code blocks within
      //   bullets here - especially as there may be more text block *after*
      //   the code block finishes
      if ($in_code_block)
      {
        $sub_line = '';
        while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
        {
          $sub_line = trim($sub_line);
          // - closing code
          if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
          {
            $sub_line = $matches[1]; // may be empty string
            $in_code_block = false;
          }
          // - output another plain codeline
          else
          {
            fwrite($xml_out, '<CodeLine>' . $sub_line . "</CodeLine>\n");
            $sub_line = '';
          }
        }
        // - if sub_line still has anything in it, then add that content as a
        //   text block
        if (!empty($sub_line))
        {
          outputTextBlock($xml_out, $sub_line);
        }
      }
    }
    // - images start with an image caption 'element'
    elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\|(.+)>([^<]*?)<\/imgcaption>/', $line, $matches))
    {
      $figure_id = $matches[1];
      $figure_title = $matches[2];
      $image_content = $matches[3];
      // - watch for the special withLineNumber flag
      $class_attribute = '';
      if (strpos($figure_title, '%!-- withLineNumber --%') != false)
      {
        $class_attribute = ' class="withLineNumber"';
        $figure_title = str_replace('%!-- withLineNumber --%','',$figure_title);
      }
      fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n");
      echo '[figure: ' . $figure_id . "] \n";
      fwrite($xml_out, '<Title>' . "\n");
      // - decode any comments in the title (used to store explicit id
      //   information)
      $figure_title = str_replace('%!--', '<!--', $figure_title);
      $figure_title = str_replace('--%', '-->', $figure_title);
      // - special case: the title may have a subtitle (as a prefix)
      $figure_subtitle_id = '';
      $figure_subtitle = '';
      // - subtitle with explicit id
      if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s*(.*)$/', $figure_title, $matches))
      {
        $figure_subtitle = $matches[1];
        $figure_title = $matches[2];
      }
      // - subtitle without explicit id
      else if (preg_match('/^(\([a-z]\))\s*(.*)$/', $figure_title, $matches))
      {
        $figure_subtitle = $matches[1];
        $figure_title = $matches[2];
      }
      outputTextBlock($xml_out, $figure_title);
      if (!empty($figure_subtitle))
      {
        fwrite($xml_out, '<SubTitle>' . "\n");
        outputTextBlock($xml_out, $figure_subtitle);
        fwrite($xml_out, '</SubTitle>' . "\n");
      }
      fwrite($xml_out, '</Title>' . "\n");
      // Try and find the image itself
      if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $image_content))
      {
        processImage($xml_out, $line);
        fwrite($xml_out, '</Figure>' . "\n");
      }
      // Didn't find an image? Weird, but mark the imgcaption as open, and
      // we'll chomp up the next image found as the content.
      else
      {
          $in_figure = true;
      }
      // - record the id to prevent repeating
      $seen_ids[$figure_id] = true;
    }
    // - tables start with a table caption 'element'
    elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\|([^>]+)>\s*<\/tblcaption>/', $line, $matches))
    {
      $table_id = $matches[1];
      $table_title = $matches[2];
      if ($table_title == '##NOCAPTION##')
      {
        echo '[non-captioned table: ' . $table_id . "] \n";
        // - watch for autogenerated ids... no point in outputting them
        if (preg_match('/^table(_\d+)?$/', $table_id))
        {
          fwrite($xml_out, "<Table>\n");
        }
        else
        {
          fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
        }
        fwrite($xml_out, '<Title/>' . "\n");
      }
      elseif ($table_title == '##HIDDEN##')
      {
        echo '[hidden table: ' . $table_id . "] \n";
        // - watch for autogenerated ids... no point in outputting them
        if (preg_match('/^table(_\d+)?$/', $table_id))
        {
          fwrite($xml_out, "<Table class=\"hidden\">\n");
        }
        else
        {
          fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n");
        }
        fwrite($xml_out, '<Title/>' . "\n");
      }
      else
      {
        echo '[table: ' . $table_id . "] \n";
        // - watch for autogenerated ids... no point in outputting them
        if (preg_match('/^table(_\d+)?$/', $table_id))
        {
          fwrite($xml_out, "<Table>\n");
        }
        else
        {
          fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
        }
        fwrite($xml_out, '<Title>' . "\n");
        outputTextBlock($xml_out, $table_title);
        fwrite($xml_out, '</Title>' . "\n");
      }
      fwrite($xml_out, '<TableContent>' . "\n");
      $in_table = true;
      // - record the id to prevent repeating
      $seen_ids[$table_id] = true;
    }
    // - the second line in a table should be its column width values
    elseif (preg_match('/\|<\s-\s([0-9 ]+?)\s>\|/', $line, $matches))
    {
      $column_widths = explode(' ', $matches[1]);
    }
    // - then every row will be made of a number of cells
    elseif (preg_match('/^\|(.*?)\|$/', $line, $matches))
    {
      $row_content = $matches[1];
      $cell_contents = preg_split('/(\s+\||\|\s+)/', $row_content);
      fwrite($xml_out, '<tr>' . "\n");
      foreach ($cell_contents as $index=>$cell_content)
      {
        $cell_content = trim($cell_content);
        $th_text = '';
        if (isset($column_widths[$index]))
        {
          $th_text = '<th width="' . $column_widths[$index] . '"';
        }
        else
        {
          $th_text = '<th';
        }
        // - if the cell would be empty, we use the shorthand
        if (empty($cell_content))
        {
          $th_text .= '/>' . "\n";
          fwrite($xml_out, $th_text);
        }
        else
        {
          $th_text .= '>' . "\n";
          fwrite($xml_out, $th_text);

          // GAH - this is proving harder than a hard thing thats hard.
          // The issue is that the most straightforward way of fixing this,
          // namely using explicit newlines (\\) in the dokuwiki txt causes
          // lots of legitimately translated <br/> to also be split up. I
          // think the only way forward would be to maybe extend the HTML
          // Comment plugin to also respect and process <br/> tags. Then I
          // can avoid transforming them, and use the \\ sentinel to
          // separate multi-line table cells.
          $cell_content_lines = explode('\\\\', $cell_content);
          foreach ($cell_content_lines as $cell_content)
          {
            // - watch out, as the content may be an image
            if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
            {
              processImage($xml_out, $cell_content);
            }
            elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
            {
              fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n");
            }
            // - anything else is text
            else
            {
              outputTextBlock($xml_out, $cell_content);
            }
          }
          fwrite($xml_out, '</th>' . "\n");
        }
      }
      fwrite($xml_out, '</tr>' . "\n");
    }
    // - links to image media in the wiki!
    elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
    {
      processImage($xml_out, $line);
      // - if we were processing a figure, then now is a good time to close it
      if ($in_figure)
      {
        fwrite($xml_out, '</Figure>' . "\n");
        $in_figure = false;
      }
    }
    // - if the line starts with a <code> block, then we have a tag
    //   for that (which is special in that it get a unique text id)
    elseif (preg_match('/^<code\s*\d*\s*>(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
    {
      $payload = $matches[1];
      $found_end = (isset($matches[2]));
      $in_code_block = true;
      // - be careful with empty lines
      if (empty($payload))
      {
        // - as they may appear in the body of the code (in which case we need
        //   to output them). The empty lines at the start or end of a code
        //   block are just an unfortunate consequence of the support for code
        //   line numbering.
        if (!$found_end && strpos($line, '<code') === false)
        {
          fwrite($xml_out, "<CodeLine/>\n");
        }
      }
      elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches))
      {
        $text_id = $matches[1];
        outputTextBlock($xml_out, $payload, 'code', true);
        // - record the id to prevent repeating
        $seen_ids[$text_id] = true;
      }
      else
      {
        fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n");
      }
      // - if we didn't find an endtag we have to keep doing code mode until
      //   we do
      $in_code_block = (!$found_end);
      if ($found_end)
      {
        // - if we were processing a figure, then now is a good time to close it
        if ($in_figure)
        {
          fwrite($xml_out, '</Figure>' . "\n");
          $in_figure = false;
        }
      }
    }
    // - entities on a line by themselves (i.e. references to external files)
    //   go through verbatim
    elseif (preg_match('/^\s*&[a-z0-9_-]+;\s*$/', $line))
    {
      fwrite($xml_out, $line . "\n");
    }
    // - lines starting with > are indented text blocks
    elseif (preg_match('/^>(.*)$/', $line, $matches))
    {
      $payload = $matches[1];
      fwrite($xml_out, "<Indented>\n");
      outputTextBlock($xml_out, $payload);
      fwrite($xml_out, "</Indented>\n");
    }
    // - everything else goes straight through as a text block
    // - note that for code blocks, even empty lines count
    elseif (!empty($line))
    {
      // - output the line of text having encoded entities etc
      outputTextBlock($xml_out, $line, '', $in_code_block);
    }
  }
  // Complete any open part
  if ($in_part)
  {
    fwrite($xml_out, '</Content>' . "\n");
    fwrite($xml_out, '</Part>' . "\n");
    $in_part = false;
  }
  // Complete any open subsection
  if ($in_subsection)
  {
    fwrite($xml_out, '</Content>' . "\n");
    fwrite($xml_out, '</Subsection>' . "\n");
    $in_subsection = false;
  }
  // Complete any open section
  if ($in_section)
  {
    fwrite($xml_out, '</Content>' . "\n");
    fwrite($xml_out, '</Section>' . "\n");
    $in_section = false;
  }
  // Complete any open chapter
  if ($in_chapter)
  {
    fwrite($xml_out, '</Content>' . "\n");
    fwrite($xml_out, '</Chapter>' . "\n");
    $in_chapter = false;
  }
}
/** processPage($xml_out, $page_name) **/

/**
 * Copy an image referenced by a dokuwiki media link into the export tree and
 * write the matching <File> element to the XML output.
 *
 * The text is searched for a media link of the form {{...name?WxH}} (with an
 * optional trailing &direct). If no such link is found nothing is copied and
 * nothing is written.
 *
 * Uses the globals $dokuwiki_path and $xml_source_path, and the language
 * code in $_REQUEST['l'], to build the source and destination paths.
 *
 * @param resource $xml_out  open handle the <File> element is written to
 * @param string   $text     wiki text containing the media link
 */
function processImage($xml_out, $text)
{
  global $dokuwiki_path;
  global $xml_source_path;
  if (preg_match('/\{\{.+?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches))
  {
    $filename = $matches[1];
    // - width and height are matched by \d+, so they are digits only
    $width = $matches[2];
    $height = $matches[3];
    // - the file name group accepts anything but ':' and '?', so it may hold
    //   markup characters (&, <, >, "). Escape it wherever it is emitted as
    //   XML or HTML, but keep the raw name for the filesystem paths.
    $escaped_filename = htmlspecialchars($filename, ENT_COMPAT, 'UTF-8');
    // - copy the file into place
    //   (the source name is lowercased, presumably because the wiki stores
    //   its media under lowercase names - confirm; the destination keeps
    //   the case used in the link)
    $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
    $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
    mkAllDir($image_destination_dir);
    $image_destination_path = $image_destination_dir . '/' . $filename;
    if (copy($image_source_path, $image_destination_path))
    {
      echo '[copying file: ' . $escaped_filename . "] \n";
      chmod($image_destination_path, 0664);
    }
    else
    {
      printError('Failed to copy image into place: ' . $filename, false);
    }
    // - spit out the XML element (written even when the copy failed, so the
    //   reference is still present in the generated XML)
    fwrite($xml_out, '<File width="' . $width . '" height="' . $height . '" url="images/' . $escaped_filename . '"/>' . "\n");
  }
}