<?php
require_once('common.php');

$debug = 1;

// There are some elements that are, in my opinion, incorrectly 'resolved' in
// code blocks just because they are valid HTML. For instance, &lt;i&gt; is
// resolved to <i> in code blocks, while something like &lt;Metadata&gt; is
// not. Set this to true to allow such abominations (for the purpose of
// comparing before and after versions of the XML). [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 *  This script transforms the series of dokuwiki pages that make up a certain
 *  manual (as specified by the 'm' argument) in a certain language ('l') into
 *  the XML format required by the rest of the Greenstone manual generation
 *  scripts.
 *
 *  Request arguments read by this script:
 *   - l: language code (defaults to 'en')
 *   - m: manual name (defaults to 'develop')
 *   - v: 'draft' or anything else for approved-only (defaults to 'draft')
 *   - a: 'download' or 'store' (defaults to 'store')
 *
 *  NOTE(review): $base_path, $dokuwiki_path, $dokuwiki_url and the helpers
 *  filecat(), recursiveRemove(), mkAllDir(), printError() and
 *  getApprovedVersionPath() are not defined here - presumably they come from
 *  common.php. printError() is assumed to abort the request unless told
 *  otherwise - confirm.
 */

// 0. Initialization
// - we have a counter to assign identifiers to text blocks etc without ids
$text_id_counter = 1;
// - we need an array of all the footnotes (footnote text => footnote id)
$footnotes = array();

// Defaults
if (!isset($_REQUEST['l']) || empty($_REQUEST['l'])) {
  $_REQUEST['l'] = 'en';
}
if (!isset($_REQUEST['m']) || empty($_REQUEST['m'])) {
  //$_REQUEST['m'] = 'user';
  //$_REQUEST['m'] = 'install';
  $_REQUEST['m'] = 'develop';
  //$_REQUEST['m'] = 'paper';
}
if (!isset($_REQUEST['v']) || empty($_REQUEST['v'])) {
  $_REQUEST['v'] = 'draft';
}
if (!isset($_REQUEST['a']) || !preg_match('/^(download|store)$/D', $_REQUEST['a'])) {
  $_REQUEST['a'] = 'store'; // Try to store the file to disk
}

$var_path = filecat(array($base_path, 'var'));
$timestamp = time();
//$xml_source_path = '**PATH TO GSDL MANUALS**'
$xml_source_path = '/tmp';
if ($_REQUEST['a'] == 'download') {
  // Clear out previous exports
  recursiveRemove($var_path, '/greenstone/greenstone-documentation/php/var');
  // New export
  $xml_source_path = fileCat(array($var_path, $timestamp));
}

echo '<html>' . "\n";
echo '<head>' . "\n";
echo '<title>GS Manual Export</title>' . "\n";
echo '</head>' . "\n";
echo '<body>' . "\n";

// - validate arguments before we use them (security). The D modifier makes
//   '$' match only at the very end of the string; without it a value with a
//   trailing newline (e.g. "en\n") would pass and later be used in file paths
//   and shell commands.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m'])) {
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}
if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l'])) {
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
echo '<p><b>Manual:</b> ' . $_REQUEST['m'] . ' <b>Language:</b> ' . $_REQUEST['l'] . "</p>\n<hr/>\n";

// 1. Create the XML output file handle
// - construct the path using the information we've been provided as arguments
$xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
mkAllDir($xml_file_dir);
$xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
// - backup any existing file
if (file_exists($xml_file_path)) {
  $xml_backup_file_path = $xml_file_path . '.bak';
  if (!rename($xml_file_path, $xml_backup_file_path)) {
    printError('Failed to rename existing manual file for backup');
  }
}
// - and create a handle to the new file (every later fwrite depends on it, so
//   fail loudly rather than writing to an invalid handle)
$xml_out = fopen($xml_file_path, 'w');
if (!$xml_out) {
  printError('Failed to open manual file for writing');
}

// 2. Read in the top level page - this will give configuration data for the
//    manual and cover page, as well as specifying the order for the other
//    pages in the manual
echo "<p><b>Frontmatter:</b><br/>\n";
// - by reading this page we hope to populate an array of metadata, and also
//   extract the sequence of other pages within this manual
$cover_metadata = array();
$pages_in_order = array();
// - we now need to consider if the user has asked for a draft version (i.e.
//   includes the latest version of pages regardless of approval) or if only
//   the approved versions of pages should be included
// - only necessary for english version of manual, as those are the only pages
//   editable
$top_page_path = '';
if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en') {
  // - again, we can construct the path to the top level page given the
  //   arguments provided
  $top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
}
else {
  $top_page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);
}
if (!file_exists($top_page_path)) {
  printError('Failed to locate top level page for manual');
}
// - we're going to open a handle to the file, then read it in line-by-line
//   watching for the lines we are interested in (using pattern matching)
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in) {
  printError('Failed to open top level page for reading');
}
$in_contents = false;
while (($line = fgets($top_page_in)) !== false) {
  // - if we are capturing page order, and we encounter something that looks
  //   like a bulletpoint item pointing to a wiki page, then we append the
  //   name of that page to our pages in order array
  if ($in_contents && preg_match('/^\s+\*\s+\[\[.:' . $_REQUEST['m'] . ':(.+?)\|(.*?)\]\]\s*$/', $line, $matches)) {
    array_push($pages_in_order, $matches[1]);
  }
  // - metadata is all encoded within dokuwiki tables; a field may repeat, so
  //   each field maps to a list of values
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches)) {
    $field = $matches[1];
    $value = $matches[2];
    $values = array();
    if (isset($cover_metadata[$field])) {
      $values = $cover_metadata[$field];
    }
    array_push($values, $value);
    $cover_metadata[$field] = $values;
  }
  // - watch for the heading 'Contents' to begin extracting page order
  //   information
  elseif (preg_match('/^=+\s(.+)\s=+$/', $line, $matches)) {
    if ($matches[1] == 'Contents') {
      $in_contents = true;
    }
    // - any other title means we aren't capturing page order (anymore)
    else {
      $in_contents = false;
    }
  }
}
if (!feof($top_page_in)) {
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);

// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field) {
  if (!isset($cover_metadata[$required_field])) {
    printError('Missing required metadata: ' . $required_field);
  }
}

// - now we can use the metadata to construct the XML header and the cover
//   page. This follows a pretty set recipe; only the elements that can
//   repeat---like Author, SupplementaryText etc---are at all tricky
fwrite($xml_out, '<?xml version="1.0" encoding="UTF-8"?>' . "\n");
fwrite($xml_out, '<!DOCTYPE Manual [' . "\n");
if (isset($cover_metadata['ENTITY'])) {
  foreach ($cover_metadata['ENTITY'] as $entity) {
    fwrite($xml_out, "\t" . '<!ENTITY ' . $entity . '>' . "\n");
  }
}
fwrite($xml_out, ']>' . "\n");
fwrite($xml_out, '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">' . "\n");
///cho "<p>[Debug] metadata: " . print_r($cover_metadata, true) . "</p>\n\n";
outputMetadataSingle($xml_out, $cover_metadata, 'Heading');
outputMetadataSingle($xml_out, $cover_metadata, 'Title');
outputMetadataSingle($xml_out, $cover_metadata, 'Author');
outputMetadataSingle($xml_out, $cover_metadata, 'Affiliation');
outputMetadataMultiple($xml_out, $cover_metadata, 'SupplementaryText');
outputMetadataMultiple($xml_out, $cover_metadata, 'Text');
outputMetadataMultiple($xml_out, $cover_metadata, 'Comment');
outputMetadataSingle($xml_out, $cover_metadata, 'Version');
outputMetadataSingle($xml_out, $cover_metadata, 'Date');

// 3. Process each page listed in the contents of the top level page in order
foreach ($pages_in_order as $page) {
  processPage($xml_out, $page);
}

// 4. Output our list of footnotes (if any)
if (!empty($footnotes)) {
  fwrite($xml_out, '<FootnoteList>'. "\n");
  foreach ($footnotes as $footnote=>$footnote_id) {
    ///cho '[debug] footnotes: (' . $footnote_id . ') ' . $footnote . '<br />'. "\n";
    outputMetadataSingle($xml_out, $footnote, 'Footnote', $footnote_id);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Finalize and close the XML output
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);
chmod($xml_file_path, 0664);

// 6. Complete!
echo '<p><b>Complete!</b></p>' . "\n<hr/>\n";
if ($_REQUEST['a'] == 'download') {
  // Zip up the manual files. The archive paths are passed through
  // escapeshellarg() rather than hand-quoted, so that an unusual character in
  // the configured base path cannot break out of the argument.
  $zip_file = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.zip';
  $zip_path = fileCat(array($xml_source_path, $zip_file));
  $zip_command = 'zip -r ' . escapeshellarg($zip_path) . ' . > /dev/null 2>&1';
  //cho '<p><b>[DEBUG]</b> zip_command:' . $zip_command . '</p>';
  $tgz_file = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.tgz';
  $tgz_path = fileCat(array($xml_source_path, $tgz_file));
  $tgz_command = 'tar -czf ' . escapeshellarg($tgz_path) . ' * > /dev/null 2>&1';
  //cho '<p><b>[DEBUG]</b> tgz_command:' . $tgz_command . '</p>';
  // We need to move to the document folder so that archives have sensible paths
  $original_cwd = getcwd();
  chdir($xml_file_dir);
  system($zip_command);
  system($tgz_command);
  // Go back
  chdir($original_cwd);
  // Links are ready
  echo '<p>Download XML file plus images as: <a href="var/' . $timestamp . '/' . $zip_file . '">ZIP</a> or <a href="var/' . $timestamp . '/' . $tgz_file . '">TGZ</a></p>' . "\n";
}
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to dokuwiki</p>' . "\n";
echo '</body>' . "\n";
echo '</html>';
exit(0);


/** Write a single <$field> element containing one text block.
 *
 *  @param resource     $xml_out  open handle of the XML file being written
 *  @param array|string $metadata either a metadata array (field => list of
 *                                values, of which only the first is used) or
 *                                the text itself
 *  @param string       $field    element name to emit; also the key looked up
 *                                when $metadata is an array
 *  @param mixed        $mid      optional id attribute value; omitted when
 *                                falsy
 *
 *  A progress note is echoed to the page, and 'no such field or no metadata'
 *  is echoed when there is nothing to write (the element is still emitted,
 *  empty).
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";
  if ($mid) {
    fwrite($xml_out, '<' . $field . ' id="' . $mid . '">' . "\n");
  }
  else {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (is_array($metadata)) {
    if (isset($metadata[$field]) && isset($metadata[$field][0])) {
      outputTextBlock($xml_out, $metadata[$field][0]);
    }
    else {
      echo 'no such field or no metadata';
    }
  }
  // - compare against the empty string rather than using empty(), so that a
  //   legitimate value of "0" is not silently dropped
  elseif (is_scalar($metadata) && (string)$metadata !== '') {
    outputTextBlock($xml_out, $metadata);
  }
  else {
    echo 'no such field or no metadata';
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/


/** Write a single <$field> element whose text is all of the field's values
 *  joined into one string.
 *
 *  @param resource     $xml_out         open handle of the XML file
 *  @param array        $metadata        field => list of values
 *  @param string       $field           element name and metadata key
 *  @param string       $separator       glue between values
 *  @param string|false $final_separator when set, used instead of $separator
 *                                       before the last value
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");
  if (isset($metadata[$field])) {
    if (count($metadata[$field]) == 1) {
      outputTextBlock($xml_out, $metadata[$field][0]);
    }
    if (count($metadata[$field]) > 1) {
      $last_value = '';
      if ($final_separator) {
        // - $metadata is a by-value copy, so popping here does not affect the
        //   caller's array
        $last_value = array_pop($metadata[$field]);
      }
      $values = implode($separator, $metadata[$field]);
      if ($final_separator) {
        $values .= $final_separator . $last_value;
      }
      outputTextBlock($xml_out, $values);
    }
  }
  else {
    echo 'no such field or no metadata';
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/


/** Write every value of a metadata field as its own text block, wrapped in a
 *  single <$field> element (except for 'Text', which is written unwrapped).
 *
 *  @param resource $xml_out  open handle of the XML file
 *  @param array    $metadata field => list of values
 *  @param string   $field    element name and metadata key
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";
  // - Text blocks don't need to be wrapped in Text element
  if ($field != 'Text') {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (isset($metadata[$field])) {
    foreach ($metadata[$field] as $value) {
      outputTextBlock($xml_out, $value);
    }
  }
  else {
    echo 'no such field or no metadata';
  }
  if ($field != 'Text') {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}
/** outputMetadataMultiple() **/


/** Encode a code line found in a table cell, keeping italics that the wiki
 *  author hid inside HTML comments as real <i> elements.
 *
 *  @param  string $text raw code line
 *  @return string       entity-encoded line with <i>...</i> restored
 */
function translateTableCodeline($text)
{
  // Escape any italic tags hidden in HTML comments
  $text = str_replace('<!--i-->', '%!--i--%', $text);
  $text = str_replace('<!--/i-->', '%!--/i--%', $text);
  // Encode entities etc
  $text = translateText($text, true);
  // Restore any italics elements hidden above
  $text = str_replace('%!--i--%', '<i>', $text);
  $text = str_replace('%!--/i--%', '</i>', $text);
  return $text;
}
/** translateTableCodeline() **/


/** Escape the XML special characters &, < and > in a piece of text.
 *
 *  @param  string $text          raw text
 *  @param  bool   $in_code_block when true and the global
 *                                $allow_bad_codeblocks is set, the escaped
 *                                forms of <i>, </i> and <br/> are turned back
 *                                into real tags
 *  @return string                escaped text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - ampersand first, otherwise the entities produced below would themselves
  //   be re-escaped
  $text = str_replace('&','&amp;',$text);
  $text = str_replace('<','&lt;',$text);
  $text = str_replace('>','&gt;',$text);
  if ($in_code_block && $allow_bad_codeblocks) {
    ///cho "**** [debug] restoring bogus decoded tags in: |$text| ****\n";
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
    $text = str_replace('&lt;br/&gt;','<br/>',$text);
  }
  return $text;
}
/** translateText() **/


/** Convert one block of dokuwiki text to the manual XML and write it out as a
 *  <Text> element.
 *
 *  Processing order matters: footnotes are pulled out first, then the
 *  explicit id comment, then escaping, then links/cross-references, then
 *  italic/underline/bold markup, and finally the placeholders introduced
 *  along the way are restored.
 *
 *  @param resource $xml_out       open handle of the XML file
 *  @param string   $text          dokuwiki source text
 *  @param string   $type          optional value for the type attribute
 *  @param bool     $in_code_block true when the text comes from a code block
 *                                 (ampersands are then left alone)
 *
 *  Globals: reads $cover_metadata (registered ENTITY declarations) and
 *  $allow_bad_codeblocks; updates $text_id_counter and $footnotes.
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches)) {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote text seen before keeps its existing id; allocating a new
    //   one would orphan the earlier reference and, because the array does
    //   not grow, hand the same id to the next new footnote
    if (isset($footnotes[$footnote])) {
      $footnote_id = $footnotes[$footnote];
    }
    else {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches)) {
    $text_id = $matches[1];
    $text = $matches[2];
    // - keep automatically assigned ids ahead of explicit numeric ones
    if (is_numeric($text_id)) {
      $text_id_counter = $text_id + 1;
    }
  }
  else {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text
  $text = preg_replace('/<!--([\s\.]+?)-->/','##lt##!--\1--##gt##',$text);
  // - strip any remaining HTML comments
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block) {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY'])) {
      foreach ($cover_metadata['ENTITY'] as $entity) {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches)) {
          $entity_name = $matches[1];
          if ($entity_name != 'mdash') {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            // - undo the escaping above for references to this entity
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char
            //   into the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code') {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internals have to become the horrible <CrossRef> tags. We ignore any
  //   number prefix on the page name as that is just used for ordering within
  //   Dokuwiki
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page (most hashes
  //   first, so the shorter prefixes don't swallow the longer ones)
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know
  //   what the page order number for another manual's chapters might be, we
  //   instead use a search. The ampersand may already have been escaped to
  //   &amp; above (non-code text) or still be raw (code blocks), so accept
  //   both.
  $text = preg_replace('/\[\[\?do\=search\&(?:amp;)?id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables (angle brackets may or may not have
  //   been escaped by this point)
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);

  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect the double slash
  //   in the protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid HTML,
  //   derp).
  if ($in_code_block && $allow_bad_codeblocks) {
    ///cho "**** [debug] restoring bogus decoded tags in: |$text| ****\n";
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
    //$text = str_replace('&lt;br/&gt;','<br/>',$text);
  }

  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '<!--', $text);
  $text = str_replace('--%', '-->', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);

  // output the text block
  $text = trim($text);
  // - strict comparison: empty() would also treat the text "0" as empty and
  //   drop it from the output
  if ($text === '') {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type)) {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/


/** */
function processPage($xml_out, $page_name)
{
  global $dokuwiki_path;
  global $seen_ids;
  echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
  // - locate the page in question (taking into account if the user asked for a
  //   draft version or an approved version of the manual)
  $page_path = '';
  if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en') {
    $page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
  }
  else {
    $page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
  }
  $page_in = @fopen($page_path, 'r');
  if (!$page_in) {
    printError('Failed to open page for reading:' .
$page_name, false); return; } // - once again we read in line-by-line, but this time we are going to output // each line as we go through. We expect to encounter certain lines in a // predefined order, and should complain if we don't find what we expect. $in_chapter = false; $in_section = false; $in_subsection = false; $in_part = false; $in_list = false; $lists = array(); $previous_listitem_type = ''; $in_figure = false; $in_table = false; $column_widths = array(); $in_code_block = false; while (($line = fgets($page_in)) !== false) { // remove newline character $line = preg_replace('/\r?\n$/','',$line); // - we need to know the 'depth' for the bulletpoint lists $depth = 0; while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == ' ') { $depth++; $line = substr($line, 2); } $first_character = substr($line, 0, 1); // - special case for the end of bullet lists if ($in_list && ($first_character != "*" && $first_character != "-")) { while (count($lists) > 0) { $list_type = array_pop($lists); if ($list_type == '*') { fwrite($xml_out, '</Bullet>' . "\n"); fwrite($xml_out, '</BulletList>' . "\n"); } else { fwrite($xml_out, '</NumberedItem>' . "\n"); fwrite($xml_out, '</NumberedList>' . "\n"); } } $in_list = false; } // - special case for the end of tables if ($in_table && $first_character != '^' && $first_character != '|') { fwrite($xml_out, '</TableContent>' . "\n"); fwrite($xml_out, '</Table>' . "\n"); $in_table = false; } // - special cases for premature closing of sections, subsections and parts if (preg_match('/<!-- close:(section|subsection|part) -->/', $line, $matches)) { // - we always try to do this (regardless of actual flag) as we must // always close the smallest 'granularity' first if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection')) { fwrite($xml_out, '</Content>' . 
"\n"); fwrite($xml_out, '</Subsection>' . "\n"); $in_subsection = false; } if ($in_section && $matches[1] == 'section') { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Section>' . "\n"); $in_section = false; } } // - if this page is a chapter, then the first thing on the page should be // the chapter title (six equals) if (preg_match('/====== (.+) ======/', $line, $matches)) { $chapter_title = $matches[1]; $chapter_id = $page_name; if (empty($chapter_id)) { $chapter_id = generateID($chapter_title); } // - are we already processing a part? if so end it, end it now if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } // - are we already processing a subsection? if so end it, end it now if ($in_subsection) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Subsection>' . "\n"); $in_subsection = false; } // - are we already processing a section? if so end it, end it now if ($in_section) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Section>' . "\n"); $in_section = false; } // - are we already processing a chapter? if so end it, end it now if ($in_chapter) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Chapter>' . "\n"); $in_chapter = false; } // - write out this chapter's header fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n"); outputMetadataSingle($xml_out, $chapter_title, 'Title'); fwrite($xml_out, '<Content>' . "\n"); $in_chapter = true; } // - the next likely thing to encounter is a section heading (five equals) elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches)) { $section_title = $matches[1]; // - check for explicit section id $section_id = ''; if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches)) { $section_id = $matches[1]; $section_title = $matches[2]; } if (empty($section_id)) { $section_id = generateID($section_title); } // - are we already processing a part? 
if so end it, end it now if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } // - are we already processing a subsection? if so end it, end it now if ($in_subsection) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Subsection>' . "\n"); $in_subsection = false; } // - are we already processing a section? if so end it, end it now if ($in_section) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Section>' . "\n"); $in_section = false; } // - write out this section's header fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n"); outputMetadataSingle($xml_out, $section_title, 'Title'); fwrite($xml_out, '<Content>' . "\n"); $in_section = true; } // - similar for subsection heading (four equals) elseif (preg_match('/==== (.+) ====/', $line, $matches)) { $subsection_title = $matches[1]; // - check for explicit subsection id $subsection_id = ''; if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches)) { $subsection_id = $matches[1]; $subsection_title = $matches[2]; } if (empty($subsection_id)) { $subsection_id = generateID($subsection_title); } // - are we already processing a part? if so end it, end it now if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } // - are we already processing a subsection? if so end it, end it now if ($in_subsection) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Subsection>' . "\n"); $in_subsection = false; } // - write out this subsection's header fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n"); outputMetadataSingle($xml_out, $subsection_title, 'Title'); fwrite($xml_out, '<Content>' . "\n"); $in_subsection = true; } // - and part heading (three equals) elseif (preg_match('/=== (.+) ===/', $line, $matches)) { $part_title = $matches[1]; // - check for explicit part id $part_id = ''; if (preg_match('/<!-- sid:(.+?) 
-->(.*)/', $part_title, $matches)) { $part_id = $matches[1]; $part_title = $matches[2]; } if (empty($part_id)) { $part_id = generateID($part_title); } // - are we already processing a part? if so end it, end it now if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } // - write out this part's header fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n"); outputMetadataSingle($xml_out, '**//' . $part_title . '//**', 'Title'); fwrite($xml_out, '<Content>' . "\n"); $in_part = true; } // - Ignore 5th level heading - they are only used to allow more convenient // editing of figures and tables elseif (preg_match('/== (.+) ==/', $line, $matches)) { } // - lists need special handling elseif (preg_match('/^(\*|\-)\s+(.*)/', $line, $matches)) { $list_type = $matches[1]; $list_text = $matches[2]; $list_depth = count($lists); if (!$in_list) { if ($list_type == '*') { fwrite($xml_out, '<BulletList>' . "\n"); } else { fwrite($xml_out, '<NumberedList>' . "\n"); } $in_list = true; array_push($lists, $list_type); } // - this bullet is at the same depth as previous - close the previous // point elseif ($depth == $list_depth) { $previous_list_type = end($lists); if ($previous_list_type == '*') { fwrite($xml_out, '</Bullet>' . "\n"); } else { fwrite($xml_out, '</NumberedItem>' . "\n"); } // - we don't match in type anymore... close the previous list and open // a new list of the appropriate type if ($list_type != $previous_list_type) { if ($previous_list_type == '*') { fwrite($xml_out, '</BulletList>' . "\n"); fwrite($xml_out, '<NumberedList>' . "\n"); } else { fwrite($xml_out, '</NumberedNumbered>' . "\n"); fwrite($xml_out, '<BulletList>' . "\n"); } array_pop($lists); array_push($lists, $list_type); } } else { // - we have either got deeper... if ($depth > $list_depth) { if ($list_type == '*') { fwrite($xml_out, '<BulletList>' . "\n"); } else { fwrite($xml_out, '<NumberedList>' . 
"\n"); } array_push($lists, $list_type); } // ... or shallower in the bullet listing if ($depth < $list_depth) { $previous_list_type = array_pop($lists); if ($previous_list_type == '*') { fwrite($xml_out, '</Bullet>' . "\n"); fwrite($xml_out, '</BulletList>' . "\n"); } else { fwrite($xml_out, '</NumberedItem>' . "\n"); fwrite($xml_out, '</NumberedList>' . "\n"); } // - we still have to close the last item too $previous_listitem_type = end($lists); if ($previous_listitem_type == '*') { fwrite($xml_out, '</Bullet>' . "\n"); } else { fwrite($xml_out, '</NumberedItem>' . "\n"); } } } if ($list_type == '*') { fwrite($xml_out, '<Bullet>' . "\n"); } else { fwrite($xml_out, '<NumberedItem>' . "\n"); } // Special Case: bullets that contain (start) a code block if (preg_match('/^(.*)<code>\s*$/', $list_text, $matches)) { $list_text = $matches[1]; $in_code_block = true; } outputTextBlock($xml_out, $list_text); // - to make things clearer, we'll process any and all code blocks within // bullets here - especially as there may be more text block *after* // the code block finishes if ($in_code_block) { $sub_line = ''; while ($in_code_block && ($sub_line = fgets($page_in)) !== false) { $sub_line = trim($sub_line); // - closing code if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches)) { $sub_line = $matches[1]; // may be empty string $in_code_block = false; } // - output another plain codeline else { fwrite($xml_out, '<CodeLine>' . $sub_line . 
"</CodeLine>\n"); $sub_line = ''; } } // - if sub_line still has anything in it, then add that content as a // text block if (!empty($sub_line)) { outputTextBlock($xml_out, $sub_line); } } } // - images start with an image caption 'element' elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\|(.+)>([^<]*?)<\/imgcaption>/', $line, $matches)) { $figure_id = $matches[1]; $figure_title = $matches[2]; $image_content = $matches[3]; // - watch for the special withLineNumber flag $class_attribute = ''; if (strpos($figure_title, '%!-- withLineNumber --%') != false) { $class_attribute = ' class="withLineNumber"'; $figure_title = str_replace('%!-- withLineNumber --%','',$figure_title); } fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n"); echo '[figure: ' . $figure_id . "] \n"; fwrite($xml_out, '<Title>' . "\n"); // - decode any comments in the title (used to store explicit id // information) $figure_title = str_replace('%!--', '<!--', $figure_title); $figure_title = str_replace('--%', '-->', $figure_title); // - special case: the title may have a subtitle (as a prefix) $figure_subtitle_id = ''; $figure_subtitle = ''; // - subtitle with explicit id if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s*(.*)$/', $figure_title, $matches)) { $figure_subtitle = $matches[1]; $figure_title = $matches[2]; } // - subtitle without explicit id else if (preg_match('/^(\([a-z]\))\s*(.*)$/', $figure_title, $matches)) { $figure_subtitle = $matches[1]; $figure_title = $matches[2]; } outputTextBlock($xml_out, $figure_title); if (!empty($figure_subtitle)) { fwrite($xml_out, '<SubTitle>' . "\n"); outputTextBlock($xml_out, $figure_subtitle); fwrite($xml_out, '</SubTitle>' . "\n"); } fwrite($xml_out, '</Title>' . "\n"); // Try and find the image itself if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $image_content)) { processImage($xml_out, $line); fwrite($xml_out, '</Figure>' . "\n"); } // Didn't find an image? 
Weird, but mark the imgcaption as open, and // we'll chomp up the next image found as the content. else { $in_figure = true; } // - record the id to prevent repeating $seen_ids[$figure_id] = true; } // - tables start with a table caption 'element' elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\|([^>]+)>\s*<\/tblcaption>/', $line, $matches)) { $table_id = $matches[1]; $table_title = $matches[2]; if ($table_title == '##NOCAPTION##') { echo '[non-captioned table: ' . $table_id . "] \n"; // - watch for autogenerated ids... no point in outputting them if (preg_match('/^table(_\d+)?$/', $table_id)) { fwrite($xml_out, "<Table>\n"); } else { fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n"); } fwrite($xml_out, '<Title/>' . "\n"); } elseif ($table_title == '##HIDDEN##') { echo '[hidden table: ' . $table_id . "] \n"; // - watch for autogenerated ids... no point in outputting them if (preg_match('/^table(_\d+)?$/', $table_id)) { fwrite($xml_out, "<Table class=\"hidden\">\n"); } else { fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n"); } fwrite($xml_out, '<Title/>' . "\n"); } else { echo '[table: ' . $table_id . "] \n"; // - watch for autogenerated ids... no point in outputting them if (preg_match('/^table(_\d+)?$/', $table_id)) { fwrite($xml_out, "<Table>\n"); } else { fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n"); } fwrite($xml_out, '<Title>' . "\n"); outputTextBlock($xml_out, $table_title); fwrite($xml_out, '</Title>' . "\n"); } fwrite($xml_out, '<TableContent>' . 
"\n"); $in_table = true; // - record the id to prevent repeating $seen_ids[$table_id] = true; } // - the second line in a table should be it's column width values elseif (preg_match('/\|<\s-\s([0-9 ]+?)\s>\|/', $line, $matches)) { $column_widths = explode(' ', $matches[1]); } // - then every row will be made of a number of cells elseif (preg_match('/^\|(.*?)\|$/', $line, $matches)) { $row_content = $matches[1]; $cell_contents = preg_split('/(\s+\||\|\s+)/', $row_content); fwrite($xml_out, '<tr>' . "\n"); foreach ($cell_contents as $index=>$cell_content) { $cell_content = trim($cell_content); $th_text = ''; if (isset($column_widths[$index])) { $th_text = '<th width="' . $column_widths[$index] . '"'; } else { $th_text = '<th'; } // - if the cell would be empty, we use the shorthand if (empty($cell_content)) { $th_text .= '/>' . "\n"; fwrite($xml_out, $th_text); } else { $th_text .= '>' . "\n"; fwrite($xml_out, $th_text); // GAH - this is proving harder than a hard thing thats hard. // The issue is that the most straightforward way of fixing this, // namely using explicit newlines (\\) in the dokuwiki txt causes // lots a legitimately translated <br/> to also be split up. I // think the only way forward would be to maybe extend the HTML // Comment plugin to also respect and process <br/> tags. Then I // can avoid transforming them, and use the \\ sentinel to // separate multi-line table cells. $cell_content_lines = explode('\\\\', $cell_content); foreach ($cell_content_lines as $cell_content) { // - watch out, as the content may be an image if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content)) { processImage($xml_out, $cell_content); } elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches)) { fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n"); } // - anything else it text else { outputTextBlock($xml_out, $cell_content); } } fwrite($xml_out, '</th>' . "\n"); } } fwrite($xml_out, '</tr>' . 
"\n"); } // - links to image media in the wiki! elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line)) { processImage($xml_out, $line); // - if we were processing a figure, then now is a good time to close it if ($in_figure) { fwrite($xml_out, '</Figure>' . "\n"); $in_figure = false; } } // - if the line starts with a <code> block, then we have a tag // for that (which is special in that it get a unique text id) elseif (preg_match('/^<code\s*\d*\s*>(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches))) { $payload = $matches[1]; $found_end = (isset($matches[2])); $in_code_block = true; // - be careful with empty lines if (empty($payload)) { // - as they may appear in the body of the code (in which case we need // to output them). The empty lines at the start or end of a code // block are just an unfortunate consequence of the support for code // line numbering. if (!$found_end && strpos($line, '<code') === false) { fwrite($xml_out, "<CodeLine/>\n"); } } elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches)) { $text_id = $matches[1]; outputTextBlock($xml_out, $payload, 'code', true); // - record the id to prevent repeating $seen_ids[$text_id] = true; } else { fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n"); } // - if we didn't find an endtag we have to keep doing code mode until // we do $in_code_block = (!$found_end); if ($found_end) { // - if we were processing a figure, then now is a good time to close it if ($in_figure) { fwrite($xml_out, '</Figure>' . "\n"); $in_figure = false; } } } // - entities on a line by themselves (i.e. references to external files) // go through verbatim elseif (preg_match('/^\s*&[a-z0-9_-]+;\s*$/', $line)) { fwrite($xml_out, $line . 
"\n"); } // - lines starting with > are indented text blocks elseif (preg_match('/^>(.*)$/', $line, $matches)) { $payload = $matches[1]; fwrite($xml_out, "<Indented>\n"); outputTextBlock($xml_out, $payload); fwrite($xml_out, "</Indented>\n"); } // - everything else goes straight through as a text block // - note that for code blocks, even empty lines count elseif (!empty($line)) { // - output the line of text having encoded entities etc outputTextBlock($xml_out, $line, '', $in_code_block); } } // Complete any open part if ($in_part) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Part>' . "\n"); $in_part = false; } // Complete any open subsection if ($in_subsection) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Subsection>' . "\n"); $in_subsection = false; } // Complete any open section if ($in_section) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Section>' . "\n"); $in_section = false; } // Complete any open chapter if ($in_chapter) { fwrite($xml_out, '</Content>' . "\n"); fwrite($xml_out, '</Chapter>' . "\n"); $in_chapter = false; } } /** processPage($xml_out, $page_name) **/

/**
 * Copy the image referenced by a wiki media link into the export directory
 * and write the matching <File> element to the XML output.
 *
 * The link must look like {{...name?WIDTHxHEIGHT}} with an optional
 * '&direct' suffix; the name is the run of characters without ':' or '?'
 * immediately before the '?'. Text that contains no such link is ignored
 * and nothing is written.
 *
 * The <File> element is written whether or not the copy succeeded; a failed
 * copy is only reported through printError().
 *
 * @param resource $xml_out  Handle the <File> element is written to (fwrite).
 * @param string   $text     Wiki text expected to contain a media link.
 * @return void
 */
function processImage($xml_out, $text)
{
    global $dokuwiki_path;
    global $xml_source_path;

    // NOTE(review): the leading '.+?' consumes at least one character, so a
    // link with nothing before the file name would lose its first character.
    if (!preg_match('/\{\{.+?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches)) {
        return;
    }

    $filename = $matches[1];
    $width = $matches[2];
    $height = $matches[3];

    // The name comes straight from wiki text, so escape it before it is
    // echoed into the HTML progress page or placed inside an XML attribute.
    // The raw name is still used for the filesystem paths below.
    $safe_filename = htmlspecialchars($filename, ENT_QUOTES);

    // - copy the file into place
    //   The source name is lowercased while the destination keeps the case
    //   used in the wiki text (presumably the wiki stores media under
    //   lowercase names - confirm).
    $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l']
        . '/manuals/images/' . strtolower($filename);
    $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
    mkAllDir($image_destination_dir);
    $image_destination_path = $image_destination_dir . '/' . $filename;

    if (copy($image_source_path, $image_destination_path)) {
        echo '[copying file: ' . $safe_filename . "] \n";
        chmod($image_destination_path, 0664);
    } else {
        // - second argument false: presumably marks the error as non-fatal
        //   (confirm against printError)
        printError('Failed to copy image into place: ' . $safe_filename, false);
    }

    // - spit out the XML element
    fwrite(
        $xml_out,
        '<File width="' . $width . '" height="' . $height
        . '" url="images/' . $safe_filename . '"/>' . "\n"
    );
}