fullname.".txt", 'wb') ) { print "Failed to open {$loc->file}.txt!
" ; flush () ; } fwrite($handle, $text) ; fclose ( $handle ) ; } else if ( $mode == "gzip" ) { if ( !$gz = gzopen($loc->fullname.".gz",'w9') ) { print "Failed to open {$loc->file}.gz!
" ; flush () ; } gzwrite($gz, $text); gzclose($gz); } }

# Returns the current time in seconds as a float, built by adding the two
# space-separated parts of the string returned by microtime().
function microtime_float() {
	list ( $usec , $sec ) = explode ( " " , microtime() ) ;
	return ( (float) $usec + (float) $sec ) ;
}

# Global functions for parsing

# Start-element handler for the XML parser.
# Makes $name the current element ($mem["name"]) and pushes it onto the $tags
# stack. Element names are compared in upper case throughout this file.
#   NAMESPACE : remembers its KEY attribute and starts with an empty name
#   TITLE     : starts with an empty raw-title buffer
#   TEXT      : starts with an empty text buffer
function XML2TXT_START ( $parser , $name , $attrs ) {
	global $mem , $tags ;
	$mem["name"] = $name ;
	$tags[] = $name ;
	if ( $name == "NAMESPACE" ) {
		$mem['key'] = $attrs["KEY"] ;
		# Reset here so that an empty <namespace/> element does not inherit the
		# name of the namespace parsed before it.
		$mem['text'] = "" ;
	} else if ( $name == "TITLE" ) {
		$mem['rawtitle'] = "" ;
	} else if ( $name == "TEXT" ) {
		$mem['text'] = "" ;
	}
}

# End-element handler for the XML parser.
# On NAMESPACE the collected name is stored in $namespaces under its key.
# On PAGE the collected text is written via store_file() in 'text' mode to the
# location returned by get_file_location_global(); a '.' is printed every 1000
# pages and a line break every 50000 pages.
# Afterwards the element is popped from $tags and $mem['name'] becomes the
# enclosing element, or "" when the stack is empty.
function XML2TXT_END ( $parser , $name ) {
	global $mem , $namespaces , $tags , $page_counter , $dir ;
	if ( $mem['name'] == 'NAMESPACE' ) {
		$namespaces[$mem['key']] = $mem['text'] ;
	} else if ( $mem['name'] == 'PAGE' ) {
		$loc = get_file_location_global ( $dir , $mem['namespace'] , $mem['title'] , true ) ;
		store_file ( $loc , $mem['text'] , 'text' ) ;
		$page_counter++ ;
		if ( $page_counter % 1000 == 0 ) {
			print '.' ;
			if ( $page_counter % 50000 == 0 ) {
				print "
" ;
			}
			flush () ;
		}
	}

	array_pop ( $tags ) ;
	# Peek at the new top of the stack without modifying it.
	$mem['name'] = ( count ( $tags ) > 0 ) ? end ( $tags ) : "" ;
}

# Character-data handler for the XML parser.
# The parser may deliver the text of one element in several pieces, so every
# kind of data is accumulated instead of overwritten.
#   NAMESPACE : appended to the namespace name
#   TITLE     : appended to the raw title; the title is then split into a
#               namespace key (0 when no known "Name:" prefix matches) and the
#               remaining title, stored in $mem['namespace'] and $mem['title']
#   TEXT      : appended to the page text
function XML2TXT_DATA ( $parser , $data ) {
	global $mem , $namespaces ;
	if ( $mem['name'] == 'NAMESPACE' ) {
		$mem['text'] .= $data ;
	} else if ( $mem['name'] == 'TITLE' ) {
		if ( !isset ( $mem['rawtitle'] ) ) {
			$mem['rawtitle'] = "" ;
		}
		$mem['rawtitle'] .= $data ;

		# Re-evaluate the complete title collected so far on every piece.
		$title = $mem['rawtitle'] ;
		$ns = 0 ;
		foreach ( $namespaces AS $k => $v ) {
			if ( $k <= 0 ) continue ; # only namespaces with a positive key are matched
			$prefix = $v . ":" ;
			# The original call omitted the subject string, so no prefix ever matched.
			if ( substr ( $title , 0 , strlen ( $prefix ) ) !== $prefix ) continue ;
			$ns = $k ;
			$title = substr ( $title , strlen ( $prefix ) ) ;
			break ;
		}
		$mem['title'] = $title ;
		$mem['namespace'] = $ns ;
	} else if ( $mem['name'] == 'TEXT' ) {
		$mem['text'] .= $data ;
	}
}

# Parses the XML file $xml_filename in chunks of 8192 bytes using the XML2TXT_*
# handlers, prints the elapsed time, and finally writes one "key:name" line per
# collected namespace to namespaces.txt inside $dir.
# Terminates the script via die() when the input file cannot be opened or the
# XML is not well-formed.
function scan_xml_file ( $xml_filename ) {
	global $namespaces , $dir , $page_counter ;

	$xml_parser_handle = xml_parser_create () ;
	xml_set_element_handler ( $xml_parser_handle , "XML2TXT_START" , "XML2TXT_END" ) ;
	xml_set_character_data_handler ( $xml_parser_handle , "XML2TXT_DATA" ) ;

	if ( !( $parse_handle = fopen ( $xml_filename , 'r' ) ) ) {
		die("FEHLER: Datei $xml_filename nicht gefunden.");
	}

	$t1 = microtime_float () ;
	# Compare explicitly: a chunk consisting only of "0" is falsy in PHP and
	# must not be mistaken for the end of the input.
	while ( ( $xml_data = fread ( $parse_handle , 8192 ) ) !== false && $xml_data !== "" ) {
		if ( !xml_parse ( $xml_parser_handle , $xml_data , feof ( $parse_handle ) ) ) {
			die(sprintf('XML error: %s at line %d',
				xml_error_string(xml_get_error_code($xml_parser_handle)),
				xml_get_current_line_number($xml_parser_handle)));
		}
	}

	$t2 = microtime_float () - $t1 ;
	print "Took {$t2} seconds total.
" ;
	flush () ;

	fclose ( $parse_handle ) ; # the input handle was previously never closed
	xml_parser_free ( $xml_parser_handle ) ;

	$handle = fopen ( $dir . "/namespaces.txt" , 'wb' ) ;
	if ( !$handle ) {
		# Report the failure instead of writing to an invalid handle.
		print "Failed to open {$dir}/namespaces.txt!\n" ;
		flush () ;
		return ;
	}
	foreach ( $namespaces AS $ns => $nst ) {
		$t = "{$ns}:{$nst}\n" ;
		fwrite ( $handle , $t ) ;
	}
	fclose ( $handle ) ;
}

# MAIN
# The output directory is $basedir plus the dump's file name without ".xml".
# array_pop() takes its argument by reference, so the exploded path has to be
# held in a variable first.
$path_parts = explode ( "/" , str_replace ( "\\" , "/" , $dumpfile ) ) ;
$dir = array_pop ( $path_parts ) ;
$dir = $basedir . "/" . str_replace ( ".xml" , "" , $dir ) ;

@set_time_limit ( 0 ) ; # No time limit
#ini_set('user_agent','MSIE 4\.0b2;'); # Fake user agent

header ('Content-type: text/html; charset=utf-8');
@mkdir ( $dir ) ;
scan_xml_file ( $dumpfile ) ;

?>