in code blocks, while something like <Metadata> is // not. Set this to true to allow such abominations (for the purpose of // comparing before and after versions of the XML). [jmt12] $allow_bad_codeblocks = true; /** @file gs-manual-export.php * This script transforms the series of dokuwiki pages that make up a certain * manual (as specified by the 'm' argument) in a certain language ('l') into * the XML format required by the rest of the Greenstone manual generation * scripts. */ // 0. Initialization // - we have a counter to assign identifiers to text blocks etc without ids $text_id_counter = 1; // - we need an array of all the footnotes $footnotes = array(); // Defaults if (!isset($_REQUEST['l']) || empty($_REQUEST['l'])) { $_REQUEST['l'] = 'en'; } if (!isset($_REQUEST['m']) || empty($_REQUEST['m'])) { //$_REQUEST['m'] = 'user'; //$_REQUEST['m'] = 'install'; $_REQUEST['m'] = 'develop'; //$_REQUEST['m'] = 'paper'; } if (!isset($_REQUEST['v']) || empty($_REQUEST['v'])) { $_REQUEST['v'] = 'draft'; } if (!isset($_REQUEST['a']) || !preg_match('/^(download|store)$/', $_REQUEST['a'])) { $_REQUEST['a'] = 'store'; // Try to store the file to disk } $var_path = filecat(array($base_path, 'var')); $timestamp = time(); //$xml_source_path = '**PATH TO GSDL MANUALS**' $xml_source_path = '/tmp'; if ($_REQUEST['a'] == 'download') { // Clear out previous exports recursiveRemove($var_path, '/greenstone/greenstone-documentation/php/var'); // New export $xml_source_path = fileCat(array($var_path, $timestamp)); } echo '' . "\n"; echo '
' . "\n"; echo 'Manual: ' . $_REQUEST['m'] . ' Language: ' . $_REQUEST['l'] . "
\nFrontmatter: [Debug] metadata: " . print_r($cover_metadata, true) . "
\n";
// - by reading the top level page we hope to populate an array of metadata,
//   and also extract the sequence of other pages within this manual
$cover_metadata = array();
$pages_in_order = array();
// - the language ('l') and manual ('m') arguments come straight from the
//   request and are about to be spliced into a filesystem path (or a wiki page
//   id), so only accept plain names. This keeps '/', '..' and friends out of
//   the path we eventually open.
$safe_name_pattern = '/^[A-Za-z0-9_\-]+$/';
$language_is_safe = isset($_REQUEST['l']) && is_string($_REQUEST['l'])
  && preg_match($safe_name_pattern, $_REQUEST['l']) === 1;
$manual_is_safe = isset($_REQUEST['m']) && is_string($_REQUEST['m'])
  && preg_match($safe_name_pattern, $_REQUEST['m']) === 1;
// - the path stays empty unless the arguments pass the check above, so a
//   rejected request can never resolve to a real file
$top_page_path = '';
if (!$language_is_safe || !$manual_is_safe)
{
  // NOTE(review): printError() is defined elsewhere; it presumably aborts the
  // export when called like this - confirm.
  printError('Invalid language or manual argument');
}
// - we now need to consider if the user has asked for a draft version (i.e.
//   includes the latest version of pages regardless of approval) or if only
//   the approved versions of pages should be included
// - only necessary for english version of manual, as those are the only pages
//   editable
elseif ($_REQUEST['v'] === 'draft' || $_REQUEST['l'] !== 'en')
{
  // - again, we can construct the path to the top level page given the
  //   arguments provided
  $top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
}
else
{
  $top_page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);
}
if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - we're going to open a handle to the file, then read it in line-by-line
//   watching for the lines we are interested in (using pattern matching)
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
// - the manual name comes from the request, so it must be quoted before being
//   embedded in a pattern; otherwise a '/', '(' or '+' in it would either
//   break the expression or silently change what it matches. The pattern does
//   not change between lines, so build it once up front.
// NOTE(review): the '.' in front of the first ':' is unescaped and therefore
// matches any single character; presumably a literal '.' was intended -
// confirm before tightening.
$page_link_pattern = '/^\s+\*\s+\[\[.:' . preg_quote($_REQUEST['m'], '/') . ':(.+?)\|(.*?)\]\]\s*$/';
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - if we are capturing page order, and we encounter something that looks
  //   like a bulletpoint item pointing to a wiki page, then we append the
  //   name of that page to our pages in order array
  if ($in_contents && preg_match($page_link_pattern, $line, $matches))
  {
    $pages_in_order[] = $matches[1];
  }
  // - metadata is all encoded within dokuwiki tables; a field may appear on
  //   several rows, so each field maps to a list of values
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $field = $matches[1];
    $value = $matches[2];
    $cover_metadata[$field][] = $value;
  }
  // - watch for the heading 'Contents' to begin extracting page order
  //   information; any other title means we aren't capturing page order
  //   (anymore). Trailing whitespace (including a stray carriage return) is
  //   tolerated here just as it is by the two patterns above.
  elseif (preg_match('/^=+\s(.+)\s=+\s*$/', $line, $matches))
  {
    $in_contents = ($matches[1] === 'Contents');
  }
}
// - fgets() returns false both at end-of-file and on a read error, so make
//   sure we actually reached the end
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - the cover page cannot be built without these fields, so report every one
//   that the top level page failed to supply
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (isset($cover_metadata[$required_field]))
  {
    // - present, nothing to complain about
    continue;
  }
  printError('Missing required metadata: ' . $required_field);
}
// - now we can use the metadata to construct the XML header and the cover page.
// This follows a pretty set recipe; only the elements that can repeat---like
// Author, SupplementaryText etc---are at all tricky
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
}
}
fwrite($xml_out, ']>' . "\n");
fwrite($xml_out, '
'. "\n";
outputMetadataSingle($xml_out, $footnote, 'Footnote', $footnote_id);
}
fwrite($xml_out, '
Complete!
' . "\n[DEBUG] zip_command:' . $zip_command . '
'; $tgz_file = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.tgz'; $tgz_path = fileCat(array($xml_source_path, $tgz_file)); $tgz_command = 'tar -czf "' . $tgz_path . '" * > /dev/null 2>&1'; //cho '[DEBUG] tgz_command:' . $tgz_command . '
'; // We need to move to the document folder so that archives have sensible paths $original_cwd = getcwd(); chdir($xml_file_dir); system($zip_command); system($tgz_command); // Go back chdir($original_cwd); // Links are ready echo 'Download XML file plus images as: ZIP or TGZ
' . "\n"; } echo 'Click here to return to dokuwiki
' . "\n"; echo '' . "\n"; echo ''; exit(0); /** */ function outputMetadataSingle($xml_out, $metadata, $field, $mid=false) { echo '[metadata: ' . $field . "] \n"; if ($mid) { fwrite($xml_out, '<' . $field . ' id="' . $mid . '">' . "\n"); } else { fwrite($xml_out, '<' . $field . '>' . "\n"); } if (is_array($metadata)) { if (isset($metadata[$field]) && isset($metadata[$field][0])) { outputTextBlock($xml_out, $metadata[$field][0]); } else { echo 'no such field or no metadata'; } } elseif (!empty($metadata)) { outputTextBlock($xml_out, $metadata); } else { echo 'no such field or no metadata'; } fwrite($xml_out, '' . $field . '>' . "\n"); } /** outputMetadataSingle() **/ function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false) { echo '[metadata list: ' . $field . "] \n"; fwrite($xml_out, '<' . $field . '>' . "\n"); if (isset($metadata[$field])) { if (count($metadata[$field]) == 1) { outputTextBlock($xml_out, $metadata[$field][0]); } if (count($metadata[$field]) > 1) { $last_value = ''; if ($final_separator) { $last_value = array_pop($metadata[$field]); } $values = implode($separator, $metadata[$field]); if ($final_separator) { $values .= $final_separator . $last_value; } outputTextBlock($xml_out, $values); } } else { echo 'no such field or no metadata'; } fwrite($xml_out, '' . $field . '>' . "\n"); } /** outputMetadataList() **/ function outputMetadataMultiple($xml_out, $metadata, $field) { echo '[metadata multiple: ' . $field . "] \n"; // - Text blocks don't need to be wrapped in Text element if ($field != 'Text') { fwrite($xml_out, '<' . $field . '>' . "\n"); } if (isset($metadata[$field])) { foreach ($metadata[$field] as $value) { outputTextBlock($xml_out, $value); } } else { echo 'no such field or no metadata'; } if ($field != 'Text') { fwrite($xml_out, '' . $field . '>' . 
"\n"); } } function translateTableCodeline($text) { // Escape any italic tags hidden in HTML comments $text = str_replace('', '%!--i--%', $text); $text = str_replace('', '%!--/i--%', $text); // Encode entities etc $text = translateText($text, true); // Restore any italics elements hidden above $text = str_replace('%!--i--%', '', $text); $text = str_replace('%!--/i--%', '', $text); return $text; } /** translateTableCodeline() **/ function translateText($text, $in_code_block=false) { global $allow_bad_codeblocks; $text = str_replace('&','&',$text); $text = str_replace('<','<',$text); $text = str_replace('>','>',$text); if ($in_code_block && $allow_bad_codeblocks) { ///cho "**** [debug] restoring bogus decoded tags in: |$text| ****\n"; $text = str_replace('<i>','',$text); $text = str_replace('</i>','',$text); $text = str_replace('<br/>','Export Chapter: " . $page_name . "
\n";
// - locate the page in question (taking into account if the user asked for a
// draft version or an approved version of the manual)
$page_path = '';
if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
{
$page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
}
else
{
$page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
}
$page_in = @fopen($page_path, 'r');
if (!$page_in)
{
printError('Failed to open page for reading:' . $page_name, false);
return;
}
// - once again we read in line-by-line, but this time we are going to output
// each line as we go through. We expect to encounter certain lines in a
// predefined order, and should complain if we don't find what we expect.
$in_chapter = false;
$in_section = false;
$in_subsection = false;
$in_part = false;
$in_list = false;
$lists = array();
$previous_listitem_type = '';
$in_figure = false;
$in_table = false;
$column_widths = array();
$in_code_block = false;
while (($line = fgets($page_in)) !== false)
{
// remove newline character
$line = preg_replace('/\r?\n$/','',$line);
// - we need to know the 'depth' for the bulletpoint lists
$depth = 0;
while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == ' ')
{
$depth++;
$line = substr($line, 2);
}
$first_character = substr($line, 0, 1);
// - special case for the end of bullet lists
if ($in_list && ($first_character != "*" && $first_character != "-"))
{
while (count($lists) > 0)
{
$list_type = array_pop($lists);
if ($list_type == '*')
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
}
else
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
}
}
$in_list = false;
}
// - special case for the end of tables
if ($in_table && $first_character != '^' && $first_character != '|')
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_table = false;
}
// - special cases for premature closing of sections, subsections and parts
if (preg_match('//', $line, $matches))
{
// - we always try to do this (regardless of actual flag) as we must
// always close the smallest 'granularity' first
if ($in_part)
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_part = false;
}
if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection'))
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_subsection = false;
}
if ($in_section && $matches[1] == 'section')
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_section = false;
}
}
// - if this page is a chapter, then the first thing on the page should be
// the chapter title (six equals)
if (preg_match('/====== (.+) ======/', $line, $matches))
{
$chapter_title = $matches[1];
$chapter_id = $page_name;
if (empty($chapter_id))
{
$chapter_id = generateID($chapter_title);
}
// - are we already processing a part? if so end it, end it now
if ($in_part)
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_part = false;
}
// - are we already processing a subsection? if so end it, end it now
if ($in_subsection)
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_subsection = false;
}
// - are we already processing a section? if so end it, end it now
if ($in_section)
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_section = false;
}
// - are we already processing a chapter? if so end it, end it now
if ($in_chapter)
{
fwrite($xml_out, '' . "\n");
fwrite($xml_out, '' . "\n");
$in_chapter = false;
}
// - write out this chapter's header
fwrite($xml_out, '\s*$/', $list_text, $matches))
{
$list_text = $matches[1];
$in_code_block = true;
}
outputTextBlock($xml_out, $list_text);
// - to make things clearer, we'll process any and all code blocks within
// bullets here - especially as there may be more text block *after*
// the code block finishes
if ($in_code_block)
{
$sub_line = '';
while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
{
$sub_line = trim($sub_line);
// - closing code
if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
{
$sub_line = $matches[1]; // may be empty string
$in_code_block = false;
}
// - output another plain codeline
else
{
fwrite($xml_out, '
\n");
}
else
{
fwrite($xml_out, '
' . "\n");
}
fwrite($xml_out, '
\n");
}
else
{
fwrite($xml_out, '
' . "\n");
}
fwrite($xml_out, '
\n");
}
else
{
fwrite($xml_out, '
' . "\n");
}
fwrite($xml_out, '
' . "\n");
foreach ($cell_contents as $index=>$cell_content)
{
$cell_content = trim($cell_content);
$th_text = '';
if (isset($column_widths[$index]))
{
$th_text = ' ' . "\n");
}
// - links to image media in the wiki!
elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
{
processImage($xml_out, $line);
// - if we were processing a figure, then now is a good time to close it
if ($in_figure)
{
fwrite($xml_out, '' . "\n");
$in_figure = false;
}
}
// - if the line starts with a to also be split up. I
// think the only way forward would be to maybe extend the HTML
// Comment plugin to also respect and process
tags. Then I
// can avoid transforming them, and use the \\ sentinel to
// separate multi-line table cells.
$cell_content_lines = explode('\\\\', $cell_content);
foreach ($cell_content_lines as $cell_content)
{
// - watch out, as the content may be an image
if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
{
processImage($xml_out, $cell_content);
}
elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
{
fwrite($xml_out, ' block, then we have a tag
// for that (which is special in that it get a unique text id)
elseif (preg_match('/^
(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
{
$payload = $matches[1];
$found_end = (isset($matches[2]));
$in_code_block = true;
// - be careful with empty lines
if (empty($payload))
{
// - as they may appear in the body of the code (in which case we need
// to output them). The empty lines at the start or end of a code
// block are just an unfortunate consequence of the support for code
// line numbering.
if (!$found_end && strpos($line, '
\n");
}
}
elseif (preg_match('/^/', $payload, $matches))
{
$text_id = $matches[1];
outputTextBlock($xml_out, $payload, 'code', true);
// - record the id to prevent repeating
$seen_ids[$text_id] = true;
}
else
{
fwrite($xml_out, '