summaryrefslogtreecommitdiff
path: root/buildscripts/wikibuilder/dumpHTML.inc
diff options
context:
space:
mode:
authorxue <>2006-10-18 01:38:47 +0000
committerxue <>2006-10-18 01:38:47 +0000
commitaf832a5a9018c7dcc2f24111d82049902269fc53 (patch)
treeb66193de9e7b75c916a92d6c951a22a90d85fed6 /buildscripts/wikibuilder/dumpHTML.inc
parent459bb9c618a8f2897da22129167e634589435e38 (diff)
merge from 3.0 branch till 1463.
Diffstat (limited to 'buildscripts/wikibuilder/dumpHTML.inc')
-rw-r--r--buildscripts/wikibuilder/dumpHTML.inc390
1 files changed, 390 insertions, 0 deletions
diff --git a/buildscripts/wikibuilder/dumpHTML.inc b/buildscripts/wikibuilder/dumpHTML.inc
new file mode 100644
index 00000000..5b8ca15a
--- /dev/null
+++ b/buildscripts/wikibuilder/dumpHTML.inc
@@ -0,0 +1,390 @@
+<?php
+/**
+ * @package MediaWiki
+ * @subpackage Maintenance
+ */
+
+define( 'REPORTING_INTERVAL', 10 );
+
+require_once( 'includes/ImagePage.php' );
+require_once( 'includes/CategoryPage.php' );
+
+class DumpHTML {
+ # Destination directory
+ var $dest;
+
+ # Show interlanguage links?
+ var $interwiki = true;
+
+ # Depth of HTML directory tree
+ var $depth = 3;
+
+ # Directory that commons images are copied into
+ var $sharedStaticPath;
+
+ # Relative path to image directory
+ var $imageRel = 'upload';
+
+ # Copy commons images instead of symlinking
+ var $forceCopy = false;
+
+ # Make links assuming the script path is in the same directory as
+ # the destination
+ var $alternateScriptPath = false;
+
+ function DumpHTML( $settings ) {
+ foreach ( $settings as $var => $value ) {
+ $this->$var = $value;
+ }
+ }
+
+ /**
+ * Write a set of articles specified by start and end page_id
+ * Skip categories and images, they will be done separately
+ */
+ function doArticles( $start, $end = false ) {
+ $fname = 'DumpHTML::doArticles';
+
+ $this->setupGlobals();
+
+ if ( $end === false ) {
+ $dbr =& wfGetDB( DB_SLAVE );
+ $end = $dbr->selectField( 'page', 'max(page_id)', false, $fname );
+ }
+
+
+ for ($id = $start; $id <= $end; $id++) {
+ if ( !($id % REPORTING_INTERVAL) ) {
+ print "Processing ID: $id\r";
+ }
+ $title = DumpTitle::newFromID( $id );
+ if ( $title ) {
+ $ns = $title->getNamespace() ;
+ if ( $ns != NS_CATEGORY ) {
+ $this->doArticle( $title );
+ }
+ }
+ }
+ print "\n";
+ }
+
+ function doSpecials() {
+ $this->doMainPage();
+
+ $this->setupGlobals();
+ print "Special:Categories...";
+ $this->doArticle( DumpTitle::makeTitle( NS_SPECIAL, 'Categories' ) );
+ print "\n";
+ }
+
+ /** Write the main page as index.html */
+ function doMainPage() {
+ global $wgMakeDumpLinks;
+
+ print "Making index.html ";
+
+ // Set up globals with no ../../.. in the link URLs
+ $this->setupGlobals( 0 );
+
+ // But still use that directory style
+ $wgMakeDumpLinks = 3;
+
+ $title = DumpTitle::newMainPage();
+
+ $text = $this->getArticleHTML( $title );
+ $file = fopen( "{$this->dest}/index.html", "w" );
+ if ( !$file ) {
+ print "\nCan't open index.html for writing\n";
+ return false;
+ }
+ fwrite( $file, $text );
+ fclose( $file );
+ print "\n";
+ }
+
+ function doImageDescriptions() {
+ global $wgSharedUploadDirectory;
+
+ $fname = 'DumpHTML::doImageDescriptions';
+
+ $this->setupGlobals( 3 );
+
+ /**
+ * Dump image description pages that don't have an associated article, but do
+ * have a local image
+ */
+ $dbr =& wfGetDB( DB_SLAVE );
+ extract( $dbr->tableNames( 'image', 'page' ) );
+ $res = $dbr->select( 'image', array( 'img_name' ), false, $fname );
+
+ $i = 0;
+ print "Writing image description pages for local images\n";
+ $num = $dbr->numRows( $res );
+ while ( $row = $dbr->fetchObject( $res ) ) {
+ if ( !( ++$i % REPORTING_INTERVAL ) ) {
+ print "Done $i of $num\r";
+ }
+ $title = DumpTitle::makeTitle( NS_IMAGE, $row->img_name );
+ if ( $title->getArticleID() ) {
+ // Already done by dumpHTML
+ continue;
+ }
+ $this->doArticle( $title );
+ }
+ print "\n";
+
+ /**
+ * Dump images which only have a real description page on commons
+ */
+ print "Writing description pages for commons images\n";
+ $i = 0;
+ for ( $hash = 0; $hash < 256; $hash++ ) {
+ $dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
+ $paths = glob( "{$this->sharedStaticPath}/$dir/*" );
+ $paths += glob( "{$this->sharedStaticPath}/thumb/$dir/*" );
+
+ foreach ( $paths as $path ) {
+ $file = basename( $path );
+ if ( !(++$i % REPORTING_INTERVAL ) ) {
+ print "$i\r";
+ }
+
+ $title = DumpTitle::makeTitle( NS_IMAGE, $file );
+ $this->doArticle( $title );
+ }
+ }
+ print "\n";
+ }
+
+ function doCategories() {
+ $fname = 'DumpHTML::doCategories';
+ $this->setupGlobals();
+
+ $dbr =& wfGetDB( DB_SLAVE );
+ print "Selecting categories...";
+ $sql = 'SELECT DISTINCT cl_to FROM ' . $dbr->tableName( 'categorylinks' );
+ $res = $dbr->query( $sql, $fname );
+
+ print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
+ $i = 0;
+ while ( $row = $dbr->fetchObject( $res ) ) {
+ if ( !(++$i % REPORTING_INTERVAL ) ) {
+ print "$i\r";
+ }
+ $title = DumpTitle::makeTitle( NS_CATEGORY, $row->cl_to );
+ $this->doArticle( $title );
+ }
+ print "\n";
+ }
+
+
+ /** Write an article specified by title */
+ function doArticle( $title ) {
+ global $wgTitle, $wgSharedUploadPath, $wgSharedUploadDirectory;
+ global $wgUploadDirectory;
+
+ $text = $this->getArticleHTML( $title );
+ if ( $text === false ) {
+ return;
+ }
+
+ # Parse the XHTML to find the images
+ $images = $this->findImages( $text );
+ $this->copyImages( $images );
+
+ # Write to file
+ $this->writeArticle( $title, $text );
+ }
+
+ /** Write the given text to the file identified by the given title object */
+ function writeArticle( &$title, $text ) {
+ $filename = strtr($title->getHashedFilename(),':~','__');
+ $fullName = "{$this->dest}/$filename";
+ $fullDir = dirname( $fullName );
+
+ wfMkdirParents( $fullDir, 0755 );
+
+ $file = fopen( $fullName, 'w' );
+ if ( !$file ) {
+ print("Can't open file $fullName for writing\n");
+ return;
+ }
+
+ fwrite( $file, $text );
+ fclose( $file );
+ }
+
+ /** Set up globals required for parsing */
+ function setupGlobals( $depth = NULL ) {
+ global $wgUser, $wgTitle, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
+ global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
+ global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
+ global $wgSharedThumbnailScriptPath, $wgEnableParserCache;
+
+ static $oldLogo = NULL;
+
+ if ( is_null( $depth ) ) {
+ $wgMakeDumpLinks = $this->depth;
+ } else {
+ $wgMakeDumpLinks = $depth;
+ }
+
+ if ( $this->alternateScriptPath ) {
+ if ( $wgMakeDumpLinks == 0 ) {
+ $wgScriptPath = '.';
+ } else {
+ $wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks - 1 );
+ }
+ } else {
+ $wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks );
+ }
+
+ $wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';
+
+ # Logo image
+ # Allow for repeated setup
+ if ( !is_null( $oldLogo ) ) {
+ $wgLogo = $oldLogo;
+ } else {
+ $oldLogo = $wgLogo;
+ }
+
+ if ( strpos( $wgLogo, $wgUploadPath ) === 0 ) {
+ # If it's in the upload directory, rewrite it to the new upload directory
+ $wgLogo = "$wgScriptPath/{$this->imageRel}/" . substr( $wgLogo, strlen( $wgUploadPath ) + 1 );
+ } elseif ( $wgLogo{0} == '/' ) {
+ # This is basically heuristic
+ # Rewrite an absolute logo path to one relative to the the script path
+ $wgLogo = $wgScriptPath . $wgLogo;
+ }
+
+ $wgScriptPath = substr($wgScriptPath,3);
+
+ $wgStylePath = $wgScriptPath ? "$wgScriptPath/" : '';
+ $wgUploadPath = "$wgScriptPath/{$this->imageRel}";
+ $wgSharedUploadPath = "$wgUploadPath/shared";
+ $wgMaxCredits = -1;
+ $wgHideInterlangageLinks = !$this->interwiki;
+ $wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
+ $wgEnableParserCache = false;
+
+ $wgUser = new User;
+ $wgUser->setOption( 'skin', 'htmldump' );
+ $wgUser->setOption( 'editsection', 0 );
+
+ $this->sharedStaticPath = "$wgUploadDirectory/shared";
+
+ }
+
+ /** Reads the content of a title object, executes the skin and captures the result */
+ function getArticleHTML( &$title ) {
+ global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic;
+
+ $wgOut = new OutputPage;
+ $wgOut->setParserOptions( new ParserOptions );
+
+ $wgTitle = $title;
+ if ( is_null( $wgTitle ) ) {
+ return false;
+ }
+
+ $ns = $wgTitle->getNamespace();
+ if ( $ns == NS_SPECIAL ) {
+ SpecialPage::executePath( $wgTitle );
+ } else {
+ if ( $ns == NS_IMAGE ) {
+ $wgArticle = new ImagePage( $wgTitle );
+ } elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
+ $wgArticle = new CategoryPage( $wgTitle );
+ } else {
+ $wgArticle = new Article( $wgTitle );
+ }
+ $wgArticle->view();
+ }
+
+ $sk =& $wgUser->getSkin();
+ ob_start();
+ $sk->outputPage( $wgOut );
+ $text = ob_get_contents();
+ ob_end_clean();
+
+ $text = str_replace(array('/:/','%7E'), array('/_/','_'), $text);
+
+ return $text;
+ }
+
+ /** Returns image paths used in an XHTML document */
+ function findImages( $text ) {
+ global $wgOutputEncoding, $wgDumpImages;
+ $parser = xml_parser_create( $wgOutputEncoding );
+ xml_set_element_handler( $parser, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
+
+ $wgDumpImages = array();
+ xml_parse( $parser, $text );
+ xml_parser_free( $parser );
+
+ return $wgDumpImages;
+ }
+
+ /**
+ * Copy images (or create symlinks) from commons to a static directory.
+ * This is necessary even if you intend to distribute all of commons, because
+ * the directory contents is used to work out which image description pages
+ * are needed.
+ */
+ function copyImages( $images ) {
+ global $wiki_dir, $output_dir;
+ global $wgSharedUploadPath, $wgSharedUploadDirectory;
+ # Find shared uploads and copy them into the static directory
+ $sharedPathLength = strlen( $wgSharedUploadPath );
+ foreach ( $images as $image => $dummy ) {
+ # Is it shared?
+ if ( strpos($image, 'upload') > 0) {
+ # Reconstruct full filename
+ $rel = substr( $image, strpos($image,'upload')+7 ); // +1 for slash
+ $sourceLoc = $wiki_dir."images/$rel";
+ $staticLoc = "$output_dir/upload/$rel";
+// print "Copying $sourceLoc to $staticLoc\n";
+
+ # Copy to static directory
+ if ( !file_exists( $staticLoc ) ) {
+ wfMkdirParents( dirname( $staticLoc ), 0755 );
+ //if ( function_exists( 'symlink' ) && !$this->forceCopy ) {
+ // symlink( $sourceLoc, $staticLoc );
+ //} else {
+ copy( $sourceLoc, $staticLoc );
+ //}
+ }
+
+ if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
+ # That was a thumbnail
+ # We will also copy the real image
+ $parts = explode( '/', $rel );
+ $rel = "{$parts[1]}/{$parts[2]}/{$parts[3]}";
+ $sourceLoc = $wiki_dir."images/$rel";
+ $staticLoc = "$output_dir/upload/$rel";
+# print "Copying $sourceLoc to $staticLoc\n";
+ if ( !file_exists( $staticLoc ) ) {
+ wfMkdirParents( dirname( $staticLoc ), 0755 );
+ copy( $sourceLoc, $staticLoc );
+ }
+ }
+ }
+ }
+ }
+}
+
+/** XML parser callback */
+function wfDumpStartTagHandler( $parser, $name, $attribs ) {
+ global $wgDumpImages;
+
+ if ( $name == 'IMG' && isset( $attribs['SRC'] ) ) {
+ $wgDumpImages[$attribs['SRC']] = true;
+ }
+}
+
+/** XML parser callback */
+function wfDumpEndTagHandler( $parser, $name ) {}
+
+# vim: syn=php
+?>