diff --git "a/include/class-web\\MediaWikis.php" "b/include/class-web\\MediaWikis.php" index 3556193..03d0d66 100644 --- "a/include/class-web\\MediaWikis.php" +++ "b/include/class-web\\MediaWikis.php" @@ -1,67 +1,83 @@ . # websites in the Internet namespace web; /** * All the MediaWiki instances in the Internet :^) */ abstract class MediaWikis { /** * Get all the registered MediaWiki classes * * @return array */ protected static function allClasses() { return [ \wm\WikipediaIt ::class, \wm\Wikidata ::class, \wm\Commons ::class, \wm\MetaWiki ::class, \web\LandscapeforWiki::class, ]; } /** * Get all the registered MediaWiki instances * * @generator */ public static function all() { foreach( self::allClasses() as $classname ) { yield $classname::instance(); } } /** * Get a specific MediaWiki instance from its UID * * @param $uid string * @return mw\StaticSite|false */ public static function findFromUID( $uid ) { foreach( self::all() as $one ) { if( $one::UID === $uid ) { return $one; } } return false; } + /** + * Get the UID constant of every instance yielded by all(), sorted with sort() + * + * @return array UID constants in ascending order + */ + public static function allUIDs() { + $all = []; + + foreach( self::all() as $wiki ) { + $all[] = $wiki::UID; + } + + sort( $all ); + + return $all; + } } diff --git a/tools/mega-export.php b/tools/mega-export.php index 5e2436a..ddd76cf 100755 --- a/tools/mega-export.php +++ b/tools/mega-export.php @@ -1,201 +1,205 @@ #!/usr/bin/php . // exit if not CLI $argv or exit( 1 ); // load boz-mw require __DIR__ . 
'/../autoload.php'; // the number '500' gives to much $DEFAULT_LIMIT = 100; // load configuration include 'config.php'; use \cli\Log; use \cli\Input; use \cli\Opts; use \cli\Param; use \cli\ParamFlag; use \cli\ParamFlagLong; use \cli\ParamValued; use \cli\ParamValuedLong; use \web\MediaWikis; use \mw\API\PageMatcher; // all the available wiki UIDs -$mediawiki_uids = []; -foreach( MediaWikis::all() as $site ) { - $mediawiki_uids[] = $site::UID; -} -$mediawiki_uids = implode( ', ', $mediawiki_uids ); +$mediawiki_uids = implode( ', ', MediaWikis::allUIDs() ); // register all CLI parameters $opts = new Opts( [ new ParamValuedLong( 'wiki', "Available wikis: $mediawiki_uids" ), new ParamValuedLong( 'limit', "Number of revisions for each request" ), new ParamValuedLong( 'file', "Output filename" ), new ParamFlag( 'help', 'h', "Show this help and quit" ), ] ); $messages = []; // choosen wiki $wiki_uid = $opts->getArg( 'wiki' ); if( !$wiki_uid ) { $messages[] = "Please specify --wiki=WIKI"; } // page titles $page_titles = Opts::unnamedArguments(); if( !$page_titles ) { $messages[] = "Please specify some page titles"; } // output filename $filename = $opts->getArg( 'file' ); if( !$filename ) { $messages[] = "Please specify a filename"; } $limit = (int) $opts->getArg( 'limit', $DEFAULT_LIMIT ); // show the help $show_help = $opts->getArg( 'help' ); if( $show_help ) { $messages = []; } else { $show_help = $messages; } if( $show_help ) { echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI --file=out.xml [OPTIONS] Page_title\n"; echo "Allowed OPTIONS:\n"; $opts->printParams(); foreach( $messages as $msg ) { echo "\nError: $msg"; } echo "\n"; exit( $opts->getArg( 'help' ) ? 
0 : 1 ); } +// resolve the wiki before creating the output file: +// findFromUID() returns false when no registered wiki has this UID +$wiki = MediaWikis::findFromUID( $wiki_uid ); +if( !$wiki ) { + Log::error( "Unknown wiki '$wiki_uid'. Available wikis: $mediawiki_uids" ); + exit( 1 ); +} // try to open the file $file = fopen( $filename, 'w' ); if( !$file ) { Log::error( "Can't open file '$filename'" ); exit( 1 ); } -$wiki = MediaWikis::findFromUID( $wiki_uid ); $wiki->login(); $requests = $wiki->createQuery( [ 'action' => 'query', 'titles' => $page_titles, 'prop' => 'revisions', 'rvprop' => [ 'ids', 'flags', 'timestamp', 'user', 'userid', 'size', 'slotsize', 'sha1', 'comment', 'content', ], 'rvslots' => 'main', 'rvlimit' => $limit, ] ); // total number of revisions $total = 0; // do not print to the out $out = '' . "\n"; foreach( $requests as $request ) { foreach( $request->query->pages as $page ) { if( isset( $page->missing ) ) { Log::error( "Page '{$page->title}' is missing" ); exit( 1 ); } $alert_much_revisions = true; foreach( $page->revisions as $i => $revision ) { // avoid nonsense revisions if( empty( $revision->comment ) ) { if( $alert_much_revisions ) { $count = count( $page->revisions ); if( $count !== $limit ) { Log::warn( "response with $count revisions instead of $limit: consider to lower your limit" ); $alert_much_revisions = false; } } continue; } $total++; foreach( $revision->slots as $slot ) { // avoid nonsense slots if( empty( $slot->contentmodel ) ) { continue; } - $safe_user = htmlentities( $revision->user ); - $safe_userid = htmlentities( $revision->userid ); - $safe_comment = htmlentities( $revision->comment ); - $safe_model = htmlentities( $slot->contentmodel ); - $safe_format = htmlentities( $slot->contentformat ); - $safe_text = htmlentities( $slot->{'*'} ); + // the output is XML: escape only the XML special characters, because HTML named entities are undefined there + $xml_flags = ENT_XML1 | ENT_QUOTES | ENT_SUBSTITUTE; + $safe_user = htmlspecialchars( $revision->user, $xml_flags, 'UTF-8' ); + $safe_userid = htmlspecialchars( (string) $revision->userid, $xml_flags, 'UTF-8' ); + $safe_comment = htmlspecialchars( $revision->comment, $xml_flags, 'UTF-8' ); + $safe_model = htmlspecialchars( $slot->contentmodel, $xml_flags, 'UTF-8' ); + $safe_format = htmlspecialchars( $slot->contentformat, $xml_flags, 'UTF-8' ); + $safe_text = htmlspecialchars( $slot->{'*'}, $xml_flags, 'UTF-8' ); $out .= "\n"; $out .= "\t{$revision->revid}\n"; $out .= "\t{$revision->parentid}\n"; $out .= "\t{$revision->timestamp}\n"; $out .= "\t\n"; $out .= "\t\t$safe_user\n"; $out .= "\t\t$safe_userid\n"; $out .= "\t\n"; $out .= "\t$safe_comment ?>"; $out .= "\t$safe_model\n"; $out .= "\t$safe_format\n"; $out .= "\tsize}\">$safe_text\n"; $out .= "\t{$revision->sha1}\n"; $out .= "\n"; } } } // write the file in chunks fwrite( $file, $out ); $out = ''; } Log::info( "you 
mega-exported $total revisions! nice shot!" ); fwrite( $file, "\n" ); fclose( $file );