diff options
Diffstat (limited to 'buildscripts/wikibuilder/dumpHTML.inc')
-rw-r--r-- | buildscripts/wikibuilder/dumpHTML.inc | 390 |
1 files changed, 390 insertions, 0 deletions
<?php
/**
 * Static HTML dump generator: renders wiki pages through the 'htmldump' skin
 * and writes them, together with the images they reference, into a
 * destination directory tree.
 *
 * @package MediaWiki
 * @subpackage Maintenance
 */

# Progress is printed once every REPORTING_INTERVAL processed items
define( 'REPORTING_INTERVAL', 10 );

require_once( 'includes/ImagePage.php' );
require_once( 'includes/CategoryPage.php' );

class DumpHTML {
	# Destination directory
	var $dest;

	# Show interlanguage links?
	var $interwiki = true;

	# Depth of HTML directory tree
	var $depth = 3;

	# Directory that commons images are copied into
	var $sharedStaticPath;

	# Relative path to image directory
	var $imageRel = 'upload';

	# Copy commons images instead of symlinking
	# NOTE(review): copyImages() currently always copies, so this flag has no effect
	var $forceCopy = false;

	# Make links assuming the script path is in the same directory as
	# the destination
	var $alternateScriptPath = false;

	/**
	 * Constructor.
	 *
	 * @param array $settings Map of member variable name => value; every
	 *                        entry is assigned onto the object as-is.
	 */
	function __construct( $settings = array() ) {
		foreach ( $settings as $var => $value ) {
			$this->$var = $value;
		}
	}

	/**
	 * Legacy (class-named) constructor, kept so the class still initialises
	 * on interpreters that do not know __construct().
	 *
	 * @param array $settings See __construct()
	 */
	function DumpHTML( $settings = array() ) {
		$this->__construct( $settings );
	}

	/**
	 * Write a set of articles specified by start and end page_id.
	 * Pages in the category namespace are skipped here; doCategories()
	 * writes them.
	 *
	 * @param int       $start First page_id to process
	 * @param int|false $end   Last page_id to process (inclusive); false
	 *                         means "up to max(page_id) in the page table"
	 */
	function doArticles( $start, $end = false ) {
		$fname = 'DumpHTML::doArticles';

		$this->setupGlobals();

		if ( $end === false ) {
			$dbr =& wfGetDB( DB_SLAVE );
			$end = $dbr->selectField( 'page', 'max(page_id)', false, $fname );
		}

		for ( $id = $start; $id <= $end; $id++ ) {
			if ( !( $id % REPORTING_INTERVAL ) ) {
				print "Processing ID: $id\r";
			}
			$title = DumpTitle::newFromID( $id );
			if ( $title ) {
				$ns = $title->getNamespace();
				if ( $ns != NS_CATEGORY ) {
					$this->doArticle( $title );
				}
			}
		}
		print "\n";
	}

	/** Write the main page and the Special:Categories page */
	function doSpecials() {
		$this->doMainPage();

		# doMainPage() changes the link globals, so set them up again
		$this->setupGlobals();
		print "Special:Categories...";
		$this->doArticle( DumpTitle::makeTitle( NS_SPECIAL, 'Categories' ) );
		print "\n";
	}

	/**
	 * Write the main page as index.html
	 *
	 * @return false|null false when index.html cannot be opened for writing
	 */
	function doMainPage() {
		global $wgMakeDumpLinks;

		print "Making index.html ";

		// Set up globals with no ../../.. in the link URLs
		$this->setupGlobals( 0 );

		// But still use that directory style
		// NOTE(review): hard-coded 3 rather than $this->depth -- confirm this is intended
		$wgMakeDumpLinks = 3;

		$title = DumpTitle::newMainPage();

		$text = $this->getArticleHTML( $title );
		$file = fopen( "{$this->dest}/index.html", "w" );
		if ( !$file ) {
			print "\nCan't open index.html for writing\n";
			return false;
		}
		fwrite( $file, $text );
		fclose( $file );
		print "\n";
	}

	/**
	 * Write image description pages: first for local images that have no
	 * associated article, then for every image found in the shared static
	 * directory.
	 */
	function doImageDescriptions() {
		$fname = 'DumpHTML::doImageDescriptions';

		$this->setupGlobals( 3 );

		/**
		 * Dump image description pages that don't have an associated article, but do
		 * have a local image
		 */
		$dbr =& wfGetDB( DB_SLAVE );
		extract( $dbr->tableNames( 'image', 'page' ) );
		$res = $dbr->select( 'image', array( 'img_name' ), false, $fname );

		$i = 0;
		print "Writing image description pages for local images\n";
		$num = $dbr->numRows( $res );
		while ( $row = $dbr->fetchObject( $res ) ) {
			if ( !( ++$i % REPORTING_INTERVAL ) ) {
				print "Done $i of $num\r";
			}
			$title = DumpTitle::makeTitle( NS_IMAGE, $row->img_name );
			if ( $title->getArticleID() ) {
				// Already done by dumpHTML
				continue;
			}
			$this->doArticle( $title );
		}
		print "\n";

		/**
		 * Dump images which only have a real description page on commons
		 */
		print "Writing description pages for commons images\n";
		$i = 0;
		for ( $hash = 0; $hash < 256; $hash++ ) {
			$dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
			$patterns = array(
				"{$this->sharedStaticPath}/$dir/*",
				"{$this->sharedStaticPath}/thumb/$dir/*"
			);

			# Collect the names from both directories. The two glob() lists must
			# be appended, not combined with "+": array union keeps only the
			# first value for each integer key and so drops thumbnail entries.
			$files = array();
			foreach ( $patterns as $pattern ) {
				$matches = glob( $pattern );
				if ( !$matches ) {
					# No match, or glob() reported an error
					continue;
				}
				foreach ( $matches as $path ) {
					$files[] = basename( $path );
				}
			}

			# An image present both in full size and as a thumbnail is listed
			# twice; write its page only once
			foreach ( array_unique( $files ) as $file ) {
				if ( !( ++$i % REPORTING_INTERVAL ) ) {
					print "$i\r";
				}

				$title = DumpTitle::makeTitle( NS_IMAGE, $file );
				$this->doArticle( $title );
			}
		}
		print "\n";
	}

	/** Write one page for every distinct category listed in categorylinks */
	function doCategories() {
		$fname = 'DumpHTML::doCategories';
		$this->setupGlobals();

		$dbr =& wfGetDB( DB_SLAVE );
		print "Selecting categories...";
		$sql = 'SELECT DISTINCT cl_to FROM ' . $dbr->tableName( 'categorylinks' );
		$res = $dbr->query( $sql, $fname );

		print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
		$i = 0;
		while ( $row = $dbr->fetchObject( $res ) ) {
			if ( !( ++$i % REPORTING_INTERVAL ) ) {
				print "$i\r";
			}
			$title = DumpTitle::makeTitle( NS_CATEGORY, $row->cl_to );
			$this->doArticle( $title );
		}
		print "\n";
	}

	/**
	 * Write an article specified by title: render it, copy the images it
	 * references and save the HTML.
	 *
	 * @param object $title Title object of the page to write
	 */
	function doArticle( $title ) {
		$text = $this->getArticleHTML( $title );
		if ( $text === false ) {
			return;
		}

		# Parse the XHTML to find the images
		$images = $this->findImages( $text );
		$this->copyImages( $images );

		# Write to file
		$this->writeArticle( $title, $text );
	}

	/**
	 * Write the given text to the file identified by the given title object
	 *
	 * @param object $title Title object; its hashed filename, with ':' and
	 *                      '~' replaced by '_', is used relative to $this->dest
	 * @param string $text  File contents
	 */
	function writeArticle( &$title, $text ) {
		$filename = strtr( $title->getHashedFilename(), ':~', '__' );
		$fullName = "{$this->dest}/$filename";
		$fullDir = dirname( $fullName );

		wfMkdirParents( $fullDir, 0755 );

		$file = fopen( $fullName, 'w' );
		if ( !$file ) {
			print( "Can't open file $fullName for writing\n" );
			return;
		}

		fwrite( $file, $text );
		fclose( $file );
	}

	/**
	 * Set up globals required for parsing
	 *
	 * @param int|null $depth Value for $wgMakeDumpLinks (number of directory
	 *                        levels used when building relative links);
	 *                        NULL means use $this->depth
	 */
	function setupGlobals( $depth = NULL ) {
		global $wgUser, $wgTitle, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
		global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
		global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
		global $wgSharedThumbnailScriptPath, $wgEnableParserCache;

		# Logo path as it was before the first rewrite
		static $oldLogo = NULL;

		if ( is_null( $depth ) ) {
			$wgMakeDumpLinks = $this->depth;
		} else {
			$wgMakeDumpLinks = $depth;
		}

		# $wgScriptPath is deliberately a local here: it is only used to derive
		# the paths assigned to the globals below
		if ( $this->alternateScriptPath ) {
			if ( $wgMakeDumpLinks == 0 ) {
				$wgScriptPath = '.';
			} else {
				$wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks - 1 );
			}
		} else {
			$wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks );
		}

		$wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';

		# Logo image
		# Allow for repeated setup
		if ( !is_null( $oldLogo ) ) {
			$wgLogo = $oldLogo;
		} else {
			$oldLogo = $wgLogo;
		}

		if ( strpos( $wgLogo, $wgUploadPath ) === 0 ) {
			# If it's in the upload directory, rewrite it to the new upload directory
			$wgLogo = "$wgScriptPath/{$this->imageRel}/" . substr( $wgLogo, strlen( $wgUploadPath ) + 1 );
		} elseif ( substr( $wgLogo, 0, 1 ) == '/' ) {
			# This is basically heuristic
			# Rewrite an absolute logo path to one relative to the script path
			$wgLogo = $wgScriptPath . $wgLogo;
		}

		# Drop the first three characters (the leading "../") of the script path
		$wgScriptPath = substr( $wgScriptPath, 3 );

		$wgStylePath = $wgScriptPath ? "$wgScriptPath/" : '';
		$wgUploadPath = "$wgScriptPath/{$this->imageRel}";
		$wgSharedUploadPath = "$wgUploadPath/shared";
		$wgMaxCredits = -1;
		# Must match the name in the global declaration above, otherwise the
		# $interwiki setting is silently ignored
		$wgHideInterlanguageLinks = !$this->interwiki;
		$wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
		$wgEnableParserCache = false;

		$wgUser = new User;
		$wgUser->setOption( 'skin', 'htmldump' );
		$wgUser->setOption( 'editsection', 0 );

		$this->sharedStaticPath = "$wgUploadDirectory/shared";
	}

	/**
	 * Reads the content of a title object, executes the skin and captures the result
	 *
	 * @param object|null $title Title object of the page to render
	 * @return string|false Rendered page, or false when $title is null
	 */
	function getArticleHTML( &$title ) {
		global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic;

		$wgOut = new OutputPage;
		$wgOut->setParserOptions( new ParserOptions );

		$wgTitle = $title;
		if ( is_null( $wgTitle ) ) {
			return false;
		}

		$ns = $wgTitle->getNamespace();
		if ( $ns == NS_SPECIAL ) {
			SpecialPage::executePath( $wgTitle );
		} else {
			if ( $ns == NS_IMAGE ) {
				$wgArticle = new ImagePage( $wgTitle );
			} elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
				$wgArticle = new CategoryPage( $wgTitle );
			} else {
				$wgArticle = new Article( $wgTitle );
			}
			$wgArticle->view();
		}

		# Capture what the skin prints instead of sending it to stdout
		$sk =& $wgUser->getSkin();
		ob_start();
		$sk->outputPage( $wgOut );
		$text = ob_get_contents();
		ob_end_clean();

		# Same character substitution as writeArticle() applies to file names
		# (':' and '~' become '_'), applied to the generated markup
		$text = str_replace( array( '/:/', '%7E' ), array( '/_/', '_' ), $text );

		return $text;
	}

	/**
	 * Returns image paths used in an XHTML document
	 *
	 * @param string $text XHTML document
	 * @return array Map of IMG SRC attribute value => true
	 */
	function findImages( $text ) {
		global $wgOutputEncoding, $wgDumpImages;
		$parser = xml_parser_create( $wgOutputEncoding );
		xml_set_element_handler( $parser, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );

		# The start tag handler collects the paths into this global
		$wgDumpImages = array();
		xml_parse( $parser, $text );
		xml_parser_free( $parser );

		return $wgDumpImages;
	}

	/**
	 * Copy images from the wiki image directory to the static directory.
	 * This is necessary even if you intend to distribute all of commons, because
	 * the directory contents is used to work out which image description pages
	 * are needed.
	 *
	 * Reads the globals $wiki_dir (used as a prefix of "images/...", so it
	 * presumably ends with a slash -- verify against the caller) and
	 * $output_dir.
	 *
	 * @param array $images Map of image path => anything, as returned by findImages()
	 */
	function copyImages( $images ) {
		global $wiki_dir, $output_dir;

		foreach ( $images as $image => $dummy ) {
			# Only paths that point into the upload directory are copied
			$pos = strpos( $image, 'upload' );
			if ( $pos === false ) {
				continue;
			}

			# Reconstruct full filename
			$rel = substr( $image, $pos + 7 ); // 6 for 'upload', +1 for slash
			if ( $rel === false || $rel === '' ) {
				continue;
			}

			# Copy to static directory
			$this->copyImageFile( $wiki_dir . "images/$rel", "$output_dir/upload/$rel" );

			if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
				# That was a thumbnail
				# We will also copy the real image
				$parts = explode( '/', $rel );
				if ( count( $parts ) >= 4 ) {
					$rel = "{$parts[1]}/{$parts[2]}/{$parts[3]}";
					$this->copyImageFile( $wiki_dir . "images/$rel", "$output_dir/upload/$rel" );
				}
			}
		}
	}

	/**
	 * Copy a single file unless the destination already exists, creating
	 * the destination directory first.
	 *
	 * @param string $sourceLoc Source file
	 * @param string $staticLoc Destination file
	 * @access private
	 */
	function copyImageFile( $sourceLoc, $staticLoc ) {
		if ( !file_exists( $staticLoc ) ) {
			wfMkdirParents( dirname( $staticLoc ), 0755 );
			copy( $sourceLoc, $staticLoc );
		}
	}
}

/**
 * XML parser callback: records the SRC attribute of every IMG element in
 * $wgDumpImages (names arrive upper-cased because the parser's default
 * case folding is left on).
 */
function wfDumpStartTagHandler( $parser, $name, $attribs ) {
	global $wgDumpImages;

	if ( $name == 'IMG' && isset( $attribs['SRC'] ) ) {
		$wgDumpImages[$attribs['SRC']] = true;
	}
}

/** XML parser callback: nothing to do on closing tags */
function wfDumpEndTagHandler( $parser, $name ) {}

# vim: syn=php