Importing Greenstone Manual XML' . "\n"; echo '
XML Source Path: ' . $xml_source_path . '
Manual: ' . $_REQUEST['m'] . '
Language: ' . $_REQUEST['l'] . "
Frontmatter:
\n";
// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
// needs to be created last, so we have to store its information
$manual_metadata = array();
$entity_replacements = array();
$footnotes = array();
$page_order = array();
$page_count = 2;
$looking_for_metadata = '';
$chapter_txt_out = false;
$frontmatter_text = '';
$in_section = false;
$sections_page_name = '';
$in_chapter = false;
$chapter_id = '';
$bullet_depth = 0;
$is_numbered_list = true;
$line_counter = 0;
$in_code = false;
$in_footnotes = false;
$in_numbered_item = 0;
$in_bullet_item = false;
$seen_code_in_item = false;
$in_indent = false;
$is_code_linenumbered = false;
// - construct the path using the information we've been provided as arguments
$xml_file_path = $xml_source_path . '/' . $_REQUEST['l'] . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
printError('Failed to locate top level page for manual');
}
// - we also use this opportunity to read in any footnotes as we'll need to
// move them onto their appropriate page
while (($line = fgets($xml_in)) !== false)
{
if (preg_match('/
Import Chapter:" . $chapter_id . '
' . "\n";
$chapter_page_name = $chapter_id;
// - create a new file to store this chapter
$chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
if (!file_exists($chapter_file_dir))
{
mkAllDir($chapter_file_dir, 0755);
}
$chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
// - backup existing file
if (file_exists($chapter_file_path))
{
$chapter_backup_file_path = $chapter_file_path . '.bak';
rename($chapter_file_path, $chapter_backup_file_path);
}
// - open new file for writing
$chapter_txt_out = fopen($chapter_file_path, 'w');
if (!$chapter_txt_out)
{
printError('Failed to open page file for writing: ' . $chapter_page_name);
}
$in_chapter = true;
$title = getTitle($xml_in, 'chapter: ' . $chapter_id);
fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
array_push($page_order, $chapter_page_name . '|' . noComments($title));
}
elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
{
fclose($chapter_txt_out);
$chapter_txt_out = false;
$in_chapter = false;
$page_count++;
}
// - section, subsection and part titles within chapter
elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
{
$title_type = $matches[1];
$section_id = $matches[2];
if (empty($section_id))
{
$section_id = generateID(strtolower($title_type));
}
echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
$header_fix = '';
$title = getTitle($xml_in, 'heading: ' . $title_type);
if ($title_type == 'Section')
{
$header_fix = '=====';
}
if ($title_type == 'Subsection')
{
$header_fix = '====';
}
if ($title_type == 'Part')
{
$header_fix = '===';
// - remove b's and i's
$title = preg_replace('/<\/?(B|I)>/i', '', $title);
}
// - if the title, as is, wouldn't autogenerate the appropriate id, then
// we have to include the id explicitly (as another html comment block)
if ($section_id != generateID($title))
{
$title = '' . $title;
$seen_ids[$section_id] = 1;
}
fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
}
elseif ($in_chapter && (strpos($line, '/', $line, $matches))
{
$figure_id = $matches[1];
$other_attributes = $matches[2];
echo '[adding figure: ' . $figure_id . "] \n";
// We need the title too
$caption = getTitle($xml_in, 'figure:' . $figure_id);
$caption = translateText(alternateComments($caption));
$txt = "
';
}
else
{
$code_prefix = '';
}
$in_code = true;
}
$code_txt = $code_prefix . '' . translateText($matches[2], true);
if ($in_chapter)
{
fwrite($chapter_txt_out, $code_txt);
}
else
{
$frontmatter_text .= $code_txt;
}
}
elseif (preg_match('/(.+?)<\/Text>/', $line, $matches))
{
$code_id = $matches[1];
// - determine the appropriate code block prefix
$code_prefix = '';
if (!$in_code)
{
if ($is_code_linenumbered)
{
$code_prefix = '';
}
else
{
$code_prefix = '';
}
$in_code = true;
}
$code_txt = $code_prefix . '' . translateText($matches[2], true);
if ($in_chapter)
{
fwrite($chapter_txt_out, $code_txt);
}
else
{
$frontmatter_text .= $code_txt;
}
}
elseif (preg_match('/(.*?)$/', $line, $matches))
{
$code_txt = $matches[1];
// - determine the appropriate code block prefix
$code_prefix = '';
if (!$in_code)
{
if ($is_code_linenumbered)
{
$code_prefix = "\n";
}
else
{
$code_prefix = "\n";
}
$in_code = true;
}
// - arg. another special case for codelines that span more than one line
// (but I guess is a bit cumbersome
// for an element name, eh?)
$another_line = '';
while (strpos($code_txt, '
') === false && ($another_line = getLine($xml_in)) !== false)
{
$code_txt .= ' ' . $another_line;
}
$code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
$code_txt = $code_prefix . translateText($code_txt, true);
if ($in_chapter)
{
fwrite($chapter_txt_out, $code_txt);
}
else
{
$frontmatter_text .= $code_txt;
}
}
// - there are also sometimes empty codelines - which indicate a newline in
// the code listing
elseif (preg_match('/ /', $line, $matches))
{
$code_txt = '';
if (!$in_code)
{
$code_txt = "\n";
if ($is_code_linenumbered)
{
$code_txt = "\n" . $code_txt;
}
else
{
$code_txt = "\n" . $code_txt;
}
$in_code = true;
}
if ($in_chapter)
{
fwrite($chapter_txt_out, $code_txt);
}
else
{
$frontmatter_text .= $code_txt;
}
}
// - reference to an external XML file
elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
{
if ($in_chapter)
{
fwrite($chapter_txt_out, $line);
}
else
{
$frontmatter_text .= $line;
}
}
elseif (strpos($line, '') !== false)
{
$in_footnotes = true;
}
elseif ($in_footnotes && strpos($line, ' ') !== false)
{
$in_footnotes = false;
}
// Indentation - the closest thing we have is quoting, so we'll use that
elseif (strpos($line, '') !== false)
{
$in_indent = true;
}
elseif (strpos($line, ' ') !== false)
{
$in_indent = false;
}
// - pattern of lines to ignore
else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>||<\/?Content>|<\/?Footnote||<\/Manual>)/', $line))
{
}
// - we ignore anything else in footnotes too, as they were handled in the
// preprocessing pass
else if ($in_footnotes)
{
}
// - ignore empty lines
else if (preg_match('/^\s*$/', $line))
{
}
// - meh. French versions have random, non-text element, linebreaks floating
// around. Guess I'll honor their formatting even though it's bogus
else if (preg_match('/^\s*
\s*$/', $line))
{
if ($in_chapter)
{
fwrite($chapter_txt_out, ' \\\\');
}
else
{
$frontmatter_text .= ' \\\\';
}
}
// - danger Will Robinson!
else
{
echo '
Warning! Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|
\n";
}
}
// 2. We should now have enough metadata to export the cover page
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
$top_page_backup_path = $top_page_path . '.bak';
if(!rename($top_page_path, $top_page_backup_path))
{
printError('Failed to rename existing top page for backup');
}
}
// - and create a handle to the new file
$txt_out = fopen($top_page_path, 'w');
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");
// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process.
fwrite($txt_out, '\n");
fwrite($txt_out, ' * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, " \n\n");
// - regular metadata
fwrite($txt_out, '' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
$values = getMetadata($field);
foreach ($values as $value)
{
fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
}
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '' . "\n");
foreach ($page_order as $page_info)
{
fwrite($txt_out, ' * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata
fwrite($txt_out, "\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
$values = getMetadata($field);
foreach ($values as $value)
{
fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
}
}
fwrite($txt_out, " \n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
// 3. And the 'sections' page, grouping together all the loose sections as
// frontmatter
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file
if (file_exists($frontmatter_page_path))
{
$frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
if(!rename($frontmatter_page_path, $frontmatter_page_backup_path))
{
printError('Failed to rename existing frontmatter page for backup');
}
}
// - populate the new frontmatter file
file_put_contents($frontmatter_page_path, $frontmatter_text);
echo "\nComplete! Imported " . $page_count . " pages
\n";
echo 'Click here to return to wiki page
' . "\n";
exit(0);
/**
 * Record one more value for a metadata field.
 *
 * Echoes a progress marker naming the field, then appends the value to the
 * list held for that field in the global $manual_metadata map, creating the
 * list first when the field has not been seen before.
 *
 * @param mixed $field key into $manual_metadata (also echoed in the marker)
 * @param mixed $value value appended to the field's list
 */
function addMetadata($field, $value)
{
    global $manual_metadata;

    echo '[adding metadata: ' . $field . "] \n";

    // - start an empty list for fields we have not seen yet
    if (!isset($manual_metadata[$field])) {
        $manual_metadata[$field] = array();
    }
    $manual_metadata[$field][] = $value;
}
/** addMetadata() **/
/**
 * Return the first value recorded for a metadata field.
 *
 * Looks the field up in the global $manual_metadata map.
 *
 * @param mixed $field key into $manual_metadata
 * @return mixed the element at index 0 of the field's list, or '' when the
 *               field is missing or its list is empty
 */
function getFirstMetadata($field)
{
    global $manual_metadata;

    // - empty() covers both "never recorded" and "recorded with no values"
    if (empty($manual_metadata[$field])) {
        return '';
    }

    return $manual_metadata[$field][0];
}
/** getFirstMetadata() **/
/**
 * Return every value recorded for a metadata field.
 *
 * @param mixed $field key into the global $manual_metadata map
 * @return mixed the stored entry for the field, or an empty array when the
 *               field has not been recorded
 */
function getMetadata($field)
{
    global $manual_metadata;

    return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/
/**
 * Read the next title from the XML stream and return it.
 *
 * Lines are consumed from $xml_in through getLine(). The function returns an
 * empty string early when the first line read satisfies the first strpos()
 * check. Otherwise it collects the title text (joining continuation lines
 * with a single space) and prepends the text of what the comments below call
 * the subtitle. printError() is called when the expected title text or a
 * closing marker is not found.
 *
 * NOTE(review): many of the search strings and patterns below are empty or
 * reduced to a single space in this copy, and $matches[2] is read from
 * patterns that show only one capture group - the original markup literals
 * appear to have been lost. Verify against a pristine copy of the file
 * before relying on this description.
 *
 * @param resource $xml_in  open handle on the manual XML, passed to getLine()
 * @param string   $element label that is only used in the error messages
 * @return string the assembled title, or '' in the early-return case
 */
function getTitle($xml_in, $element)
{
$title = '';
$in_title_element = false;
// - the first thing in a chapter will be its title
$title_line = getLine($xml_in);
// - super special case: some language versions don't wrap titles in title
// element, so if the first thing we see is a text, we treat that as the
// title
if (strpos($title_line, ' ') !== false)
{
return '';
}
if (strpos($title_line, '') !== false)
{
$in_title_element = true;
$title_line = getLine($xml_in);
}
// - some horribly formed entries have the subtitle first within the title
// element
if (strpos($title_line, '') !== false)
{
$title_line = getLine($xml_in);
if (preg_match('/(.+?)<\/Text>/', $title_line, $matches))
{
$title = '' . $matches[2] . ' ' . $title;
}
$title_line = getLine($xml_in);
if (strpos($title_line, ' ') === false)
{
printError('Failed to find closing title for: ' . $element);
}
$title_line = getLine($xml_in);
}
// - grab the chapter title now so we can store it in the page ordering
if (preg_match('/(.*?)$/', $title_line, $matches))
{
$id = $matches[1];
$str = $matches[2];
// - special case for text blocks that span multiple lines (as discovered
// in the russian "From Paper"
$another_line = '';
while (strpos($str, ' ') === false && ($another_line = getLine($xml_in)) !== false)
{
$str .= ' ' . $another_line;
}
// - now remove
$str = preg_replace('/<\/Text>\s*/', '', $str);
$title = '' . $str . $title;
}
// - special case for (stoopid) empty titles that use up a text id
elseif (preg_match('/ /', $title_line, $matches))
{
$title = '' . $title;
}
else
{
printError('Failed to find title text for: ' . $element);
}
// - watch for subtitle elements
if ($in_title_element)
{
$title_line = getLine($xml_in);
if (strpos($title_line, '') !== false)
{
$title_line = getLine($xml_in);
if (preg_match('/(.+?)<\/Text>/', $title_line, $matches))
{
$title = '' . $matches[2] . ' ' . $title;
}
$title_line = getLine($xml_in);
if (strpos($title_line, ' ') === false)
{
printError('Failed to find closing title for: ' . $element);
}
$title_line = getLine($xml_in);
}
if (strpos($title_line, '') === false)
{
printError('Failed to find closing title for: ' . $element);
}
}
return $title;
}
/** getTitle() **/
/**
 * Rewrite comment markers in a piece of text and drop cross-references.
 *
 * Runs $text through one str_replace() whose replacement is '--%' and one
 * preg_replace() whose replacement is the first capture group, then returns
 * the result.
 *
 * NOTE(review): in this copy the str_replace() search string is empty and
 * the regex is the empty pattern '//', so the original literals appear to
 * have been lost (possibly a second str_replace() along with them). Confirm
 * against a pristine copy of the file.
 *
 * @param string $text text to process
 * @return string the processed text
 */
function alternateComments($text)
{
$text = str_replace('', '--%', $text);
// remove any lurking crossrefs while we are at it
$text = preg_replace('//', '\\1', $text);
return $text;
}
/**
 * Convert italic markup to DokuWiki '//...//' and strip what the function
 * name suggests are comments.
 *
 * The first preg_replace() wraps the captured text that precedes a closing
 * '</i>' in '//' markers; the second replaces every match of its pattern with
 * the empty string.
 *
 * NOTE(review): the first pattern shows no opening tag before the capture
 * group and the second pattern is empty in this copy - the original literals
 * appear to have been lost. Confirm against a pristine copy of the file.
 *
 * @param string $text text to process
 * @return string the processed text
 */
function noComments($text)
{
$text = preg_replace('/(.*?)<\/i>/','//\1//',$text);
return preg_replace('//', '', $text);
}
/**
 * Translate a code line that appears inside a table.
 *
 * Two str_replace() calls first swap search strings for the placeholders
 * '%!--i--%' and '%/i%', the text is then passed to translateText() in
 * entities-only mode, and finally the two placeholders are replaced again
 * before the text is returned.
 *
 * NOTE(review): in this copy the search strings of the first two
 * str_replace() calls and the replacement strings of the last two are empty,
 * and both disabled debug statements are split so that a stray '\n";' sits on
 * a line of its own outside the comment. The original literals (italic tags
 * and HTML comments, going by the inline comments) appear to have been lost;
 * confirm against a pristine copy of the file.
 *
 * @param string $text code line text to translate
 * @return string the translated text
 */
function translateTableCodeline($text)
{
///cho "Debug: translateTableCodeLine('" . htmlspecialchars($text) . "')
\n";
// Escape the current italics tags to prevent the translate destroying them
$text = str_replace('', '%!--i--%', $text);
$text = str_replace('', '%/i%', $text);
// Translate the text, just decoding the entities
$text = translateText($text, true);
// Now turn the italic tags (escaped) into HTML comments so we remember them
// but they are hidden in the text
$text = str_replace('%!--i--%', '', $text);
$text = str_replace('%/i%', '', $text);
///cho " => '" . htmlspecialchars($text) . "
\n";
return $text;
}
/**
 * Convert a fragment of manual XML text into DokuWiki markup.
 *
 * Unless the global $in_code flag is set, '<!--' and '-->' are first
 * rewritten to '%!--' and '--%'. When $entities_only is false the text then
 * goes through a fixed sequence of replacements that produce DokuWiki link
 * syntax ('[[...]]'), inline footnotes ('((...))', looked up in the global
 * $footnotes map), spaced-out italic/underline markers around URLs, and the
 * '**', '//' and '__' formatting markers. Finally each entry of the global
 * $entity_replacements map is substituted for its '&name;' form and three
 * "standard entity" replacements are applied.
 *
 * printError() is called for an external reference that lacks a manual name,
 * language or page id, and for a footnote id missing from $footnotes.
 *
 * NOTE(review): many string and regex literals in this copy are empty, a
 * single space, or visibly truncated (for example $matches[1] is read from
 * patterns with no capture group, and the three commented-out str_replace()
 * calls are split across lines, leaving fragments outside the comment). The
 * original markup literals appear to have been lost; confirm every pattern
 * against a pristine copy of the file before relying on this description.
 *
 * @param string $text          text to translate
 * @param bool   $entities_only when true, skip the link and formatting
 *                              replacements and only run the comment
 *                              protection and entity substitutions
 * @return string the translated text
 */
function translateText($text, $entities_only=false)
{
global $entity_replacements;
global $footnotes;
global $in_code;
// - immediately find and protect any legitimate HTML comments in the text
// (so already using encoded entities), otherwise they'll be thoroughly
// mangled during the following transforms. This has to be matched with
// changes to the HTMLComments plugin in Dokuwiki to allow the correct
// thing to be displayed to the user.
if (!$in_code)
{
$text = str_replace('<!--', '%!--', $text);
$text = str_replace('-->', '--%', $text);
}
if (!$entities_only)
{
// - replace linking constructs with dokuwiki ones
// - external chapter section crossrefs are easily the worst of all...
while (preg_match('/]*external[^>]*\/>/', $text) && preg_match('/]*target="Chapter"[^>]*\/>/', $text) && preg_match('/ /', $text, $matches))
{
$pattern = $matches[0];
$attributes = $matches[1];
$manual_name = '';
if (preg_match('/external="([^"]+)"/', $attributes, $matches))
{
$manual_name = $matches[1];
}
$language = '';
if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
{
$language = $matches[1];
}
$page_id = '';
if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
{
$page_id = $matches[1];
}
if (empty($manual_name) || empty($language) || empty($page_id))
{
printError('Failed to parse external reference: ' . $pattern);
}
// - best we can do is a search within a restricted namespace
$reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
$text = str_replace($pattern, $reference, $text);
}
// - chapter crossrefs are tricksie due to needing to know ordering numbers
while (preg_match('/ /', $text, $matches))
{
$chapter_id = $matches[1];
$page_name = $chapter_id;
$text = preg_replace('/ /', '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
}
// - internal figure and table references
$text = preg_replace('/ /','', $text);
$text = preg_replace('/ /','', $text);
// - simple internal reference
$text = preg_replace('/ /', '[[#\1|\1]]', $text);
$text = preg_replace('/ /', '[[##\1|\1]]', $text);
$text = preg_replace('/ /', '[[###\1|\1]]', $text);
// - simple external url
$text = preg_replace('/(.+?)<\/Link>/', '[[\1|\2]]', $text);
// - footnote references are also tricksie as we've had to extract the
// footnotes earlier (during chapter counting)
while (preg_match('/ /', $text, $matches))
{
$footnote_id = $matches[1];
if (!isset($footnotes[$footnote_id]))
{
printError('Unknown footnote referenced: ' . $footnote_id);
}
$footnote = $footnotes[$footnote_id];
$text = preg_replace('/ /', '((' . $footnote . '))', $text);
}
// - detect and handle URLs surrounded by tags very carefully (as doku
// will less than helpfully turn them into an external link and screw up
// everything that follows them on the page).
// example: www.microsoft.com
// example: http://nzdl.org/cgi-bin/library
// example: www.yourserver.com
// example: http://www.yourserver.com
// example: http://www.yourserver.com/greenstone
$text = preg_replace('/((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
// - superspecial case for two-part URLs ending in .org (like nzdl.org)
$text = preg_replace('/((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
// - another superspecial case, this time for URLs on localhost
$text = preg_replace('/((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
// - p00p, underlines have the same issue around URLs.
$text = preg_replace('/((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
$text = preg_replace('/((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
$text = preg_replace('/((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
// - replace HTML elements with the dokuwiki style equivalents
$text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
// - restore the double slashes in dokuwiki links
while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
{
$text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
}
$text = str_replace('', '**', $text);
$text = str_replace('', '**', $text);
//$text = str_replace('
', '\\\\ ', $text);
//$text = str_replace('
', '\\\\ ', $text);
//$text = str_replace('
', '\\\\ ', $text);
$text = str_replace('', '//', $text);
$text = str_replace('', '//', $text);
$text = str_replace('', '__', $text);
$text = str_replace('', '__', $text);
}
// Decode entities
// - user defined entities (in the manual metadata)
foreach ($entity_replacements as $entity=>$code)
{
$text = str_replace('&' . $entity . ';', html_entity_decode(''.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
}
// - standard entities
$text = str_replace('>','>', $text);
$text = str_replace('<','<', $text);
$text = str_replace('&','&', $text);
return $text;
}
/** translateText() **/
/**
 * Copy a manual image into the wiki media tree and return the DokuWiki
 * markup that embeds it.
 *
 * The source is read from
 * $xml_source_path/<l>/images/<filename> and written, with a lower-cased
 * file name, to $dokuwiki_path/data/media/<l>/manuals/images/, where <l> is
 * $_REQUEST['l']. The destination directory is created with mkAllDir() when
 * it does not exist. A progress marker is echoed before copying.
 *
 * printError() is called when copy() reports failure or the destination file
 * is absent afterwards.
 *
 * @param string $filename image file name as referenced by the manual
 * @param mixed  $width    width placed in the '?WxH' size suffix
 * @param mixed  $height   height placed in the '?WxH' size suffix
 * @return string DokuWiki image markup of the form
 *                '{{..:images:<name>?<width>x<height>&direct}}'
 */
function handleImage($filename, $width, $height)
{
    global $dokuwiki_path;
    global $xml_source_path;

    echo '[copying image: ' . $filename . "] \n";

    // - copy file into place
    $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
    $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
    if (!file_exists($destination_dir)) {
        mkAllDir($destination_dir, 0755);
    }
    $wiki_filename = strtolower($filename);
    $destination_path = $destination_dir . $wiki_filename;

    // - check copy()'s own result as well as the destination: testing only
    //   for the destination file would let a failed copy pass unnoticed
    //   whenever an image left over from an earlier import is still there
    if (!copy($source_path, $destination_path) || !file_exists($destination_path)) {
        printError('Failed to copy image file: ' . $filename);
    }

    // - create the string
    return '{{..:images:' . $wiki_filename . '?' . $width . 'x' . $height . '&direct}}';
}
/**
 * Read the next line from a stream while keeping the global line count.
 *
 * The global $line_counter is incremented on every call, including calls
 * that hit end-of-file.
 *
 * @param resource $in open stream handle to read from
 * @return string|false the line as returned by fgets(), or false when no
 *                      more data can be read
 */
function getLine($in)
{
    global $line_counter;

    $next_line = fgets($in);
    $line_counter++;

    return $next_line;
}
/**
 * Strip inline formatting from a piece of text.
 *
 * Runs $text through a fixed series of str_replace() calls, each replacing
 * its search string with the empty string, and returns the result.
 *
 * NOTE(review): in this copy four of the five search strings are empty and
 * the fifth is a bare line break. The function name suggests they were
 * formatting tags whose literals have been lost; confirm against a pristine
 * copy of the file.
 *
 * @param string $text text to process
 * @return string the processed text
 */
function noFormatting($text)
{
$text = str_replace( '', '', $text);
$text = str_replace('', '', $text);
$text = str_replace( '', '', $text);
$text = str_replace('', '', $text);
$text = str_replace('
', '', $text);
return $text;
}
?>