diff --git a/tools/README.md b/tools/README.md
index 1cf543c..d298f35 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -1,69 +1,69 @@
 # boz-mw command line tools
 
 This is a directory for command line tools that use the boz-mw framework.
 
 At the moment there are not many tools.
 
 ## Configuration
 
 Copy `config-example.php` to `config.php` and fill it with your bot credentials.
 
 ## Available scripts
 
 ### Replace script - `replace.php`
 
 This is a script to substitute text in the wikitext of the pages returned by an API query.
 
 E.g. to transform the whole Italian Wikipedia into *farfallese*:
 
 ```bash
 ./replace.php --wiki=itwiki --generator=allpages \
     a afa \
     e efe \
     i ifi \
     o ofo \
     u ufu
 ```
 
 E.g. to replace a simple template parameter at the top of the page, for example from `{{Sito web|commerciale = Sì}}` to `{{Sito web|lucro = Sì}}`:
 
 ```bash
 ./replace.php \
     --wiki=itwiki \
     --generator=transcludedin \
     --titles=Template:Sito_web \
     --rvsection=0 \
     --regex \
     '/\|commerciale(.*=.*)(Sì|No|sì|no)(.*)/' \
     '|lucro$1$2$3'
 ```
 
 Other options:
 
 ```bash
 ./replace.php --help
 ```
 
 You can see some [examples](./examples).
 
 ### Mega export - `mega-export.php`
 
 This is a script that acts similarly to the `[[Special:Export]]` page, but it exports the full page history.
 
 Note that you have to provide your user credentials in the `config.php` script in order to download more than `50` revisions at a time.
 
 ```
 Usage:
- ./mega-export.php --wiki=WIKI [OPTIONS] Page_title > filename.xml
+ ./mega-export.php --wiki=WIKI --file=out.xml [OPTIONS] Page_title
 Allowed OPTIONS:
  --wiki=VALUE         Available wikis: itwiki, wikidatawiki, commonswiki, metawiki, landscapeforwiki
  --limit=VALUE        Number of revisions for each request
  --file=VALUE         Output filename
  --help|-h            Show this help and quit
 ```
 
 E.g. to download the full history of the [Alessandro Manzoni](https://it.wikipedia.org/wiki/Alessandro_Manzoni) page:
 
 ```
-./mega-export.php --wiki=itwiki "Alessandro Manzoni" > manzoni.xml
+./mega-export.php --wiki=itwiki --file=manzoni.xml "Alessandro Manzoni"
 ```
 
 Note that the official MediaWiki XML export format is only loosely followed, at least for the header section: you will not obtain the namespace list, the wiki name and other not-so-useful things. Just revisions. Many revisions.
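For reference, a hypothetical end-to-end check of the updated script: export one page with an explicit `--limit` and the new `--file` option, then count the exported revisions. The page title and output filename are illustrative placeholders, and the count relies on the new writer emitting each `<revision>` opening tag at the start of its own line:

```bash
# export the full history, 100 revisions per API request, into a local XML file
./mega-export.php --wiki=itwiki --limit=100 --file=Alessandro_Manzoni.xml "Alessandro Manzoni"

# count the exported <revision> elements; it should normally match the script's final log message
grep -c '^<revision>' Alessandro_Manzoni.xml
```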
diff --git a/tools/mega-export.php b/tools/mega-export.php
index 3256f2e..e410cb2 100755
--- a/tools/mega-export.php
+++ b/tools/mega-export.php
@@ -1,156 +1,217 @@
 #!/usr/bin/php
 <?php
 // exit if not CLI
 $argv or exit( 1 );
 
 // load boz-mw
 require __DIR__ . '/../autoload.php';
 
+// the number '500' gives too much
+$DEFAULT_LIMIT = 100;
+
 // load configuration
 include 'config.php';
 
 use \cli\Log;
 use \cli\Input;
 use \cli\Opts;
 use \cli\Param;
 use \cli\ParamFlag;
 use \cli\ParamFlagLong;
 use \cli\ParamValued;
 use \cli\ParamValuedLong;
 use \web\MediaWikis;
 use \mw\API\PageMatcher;
 
 // all the available wiki UIDs
 $mediawiki_uids = [];
 foreach( MediaWikis::all() as $site ) {
     $mediawiki_uids[] = $site::UID;
 }
 $mediawiki_uids = implode( ', ', $mediawiki_uids );
 
 // register all CLI parameters
 $opts = new Opts( [
     new ParamValuedLong( 'wiki',  "Available wikis: $mediawiki_uids" ),
     new ParamValuedLong( 'limit', "Number of revisions for each request" ),
     new ParamValuedLong( 'file',  "Output filename" ),
     new ParamFlag(       'help', 'h', "Show this help and quit" ),
 ] );
 
 $messages = [];
 
 // chosen wiki
 $wiki_uid = $opts->getArg( 'wiki' );
 if( !$wiki_uid ) {
     $messages[] = "Please specify --wiki=WIKI";
 }
 
 // page titles
 $page_titles = Opts::unnamedArguments();
 if( !$page_titles ) {
     $messages[] = "Please specify some page titles";
 }
 
-$limit = $opts->getArg( 'limit', 500 );
+// output filename
+$filename = $opts->getArg( 'file' );
+if( !$filename ) {
+    $messages[] = "Please specify a filename";
+}
+
+$limit = (int) $opts->getArg( 'limit', $DEFAULT_LIMIT );
 
 // show the help
 $show_help = $opts->getArg( 'help' );
 if( $show_help ) {
     $messages = [];
 } else {
     $show_help = $messages;
 }
 
 if( $show_help ) {
-    echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI [OPTIONS] Page_title > filename.xml\n";
+    echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI --file=out.xml [OPTIONS] Page_title\n";
     echo "Allowed OPTIONS:\n";
     foreach( $opts->getParams() as $param ) {
         $commands = [];
         if( $param->hasLongName() ) {
             $commands[] = '--' . $param->getLongName();
         }
         if( $param->hasShortName() ) {
             $commands[] = '-' . $param->getShortName();
         }
         $command = implode( '|', $commands );
         if( $command && ! $param->isFlag() ) {
             $command .= $param->isValueOptional() ? '=[VALUE]' : '=VALUE';
         }
         printf( ' % -20s ', $command );
         if( $param->hasDescription() ) {
             echo ' ' . $param->getDescription();
         }
         echo "\n";
     }
     foreach( $messages as $msg ) {
         echo "\nError: $msg";
     }
     echo "\n";
     exit( $opts->getArg( 'help' ) ? 0 : 1 );
 }
 
+// try to open the output file
+$file = fopen( $filename, 'w' );
+if( !$file ) {
+    Log::error( "Can't open file '$filename'" );
+    exit( 1 );
+}
+
 $wiki = MediaWikis::findFromUID( $wiki_uid );
 
 $wiki->login();
 
 $requests = $wiki->createQuery( [
     'action'  => 'query',
     'titles'  => $page_titles,
     'prop'    => 'revisions',
     'rvprop'  => [
         'ids',
         'flags',
         'timestamp',
         'user',
         'userid',
         'size',
         'slotsize',
         'sha1',
         'comment',
         'content',
     ],
     'rvslots' => 'main',
     'rvlimit' => $limit,
 ] );
 
-?>
-<mediawiki>
-<?php foreach( $requests as $request ): ?>
-    <?php foreach( $request->query->pages as $page ): ?>
-        <?php foreach( $page->revisions as $revision ): ?>
-            <?php if( empty( $revision->comment ) ) continue ?>
-            <?php foreach( $revision->slots as $slot ): ?>
-                <?php if( empty( $slot->contentmodel ) ) continue ?>
-                <revision>
-                    <id><?= $revision->revid ?></id>
-                    <parentid><?= $revision->parentid ?></parentid>
-                    <timestamp><?= $revision->timestamp ?></timestamp>
-                    <contributor>
-                        <username><?= htmlentities( $revision->user ) ?></username>
-                        <id><?= htmlentities( $revision->userid ) ?></id>
-                    </contributor>
-                    <comment><?= htmlentities( $revision->comment ) ?></comment>
-                    <model><?= htmlentities( $slot->contentmodel ) ?></model>
-                    <format><?= htmlentities( $slot->contentformat ) ?></format>
-                    <text><?= htmlentities( $slot->{'*'} ) ?></text>
-                    <sha1><?= htmlentities( $revision->sha1 ) ?></sha1>
-                </revision>
-            <?php endforeach ?>
-        <?php endforeach ?>
-    <?php endforeach ?>
-<?php endforeach ?>
-</mediawiki>
+// total number of revisions
+$total = 0;
+
+// do not print to standard output: buffer the XML instead
+$out = '<mediawiki>' . "\n";
+
+foreach( $requests as $request ) {
+
+    foreach( $request->query->pages as $page ) {
+
+        if( isset( $page->missing ) ) {
+            Log::error( "Page '{$page->title}' is missing" );
+            exit( 1 );
+        }
+
+        $alert_much_revisions = true;
+        foreach( $page->revisions as $i => $revision ) {
+
+            // avoid nonsense revisions
+            if( empty( $revision->comment ) ) {
+                if( $alert_much_revisions ) {
+                    $count = count( $page->revisions );
+                    if( $count !== $limit ) {
+                        Log::warn( "response with $count revisions instead of $limit: consider lowering your limit" );
+                        $alert_much_revisions = false;
+                    }
+                }
+                continue;
+            }
+
+            $total++;
+
+            foreach( $revision->slots as $slot ) {
+
+                // avoid nonsense slots
+                if( empty( $slot->contentmodel ) ) {
+                    continue;
+                }
+
+                $safe_user    = htmlentities( $revision->user );
+                $safe_userid  = htmlentities( $revision->userid );
+                $safe_comment = htmlentities( $revision->comment );
+                $safe_model   = htmlentities( $slot->contentmodel );
+                $safe_format  = htmlentities( $slot->contentformat );
+                $safe_text    = htmlentities( $slot->{'*'} );
+
+                $out .= "<revision>\n";
+                $out .= "\t<id>{$revision->revid}</id>\n";
+                $out .= "\t<parentid>{$revision->parentid}</parentid>\n";
+                $out .= "\t<timestamp>{$revision->timestamp}</timestamp>\n";
+                $out .= "\t<contributor>\n";
+                $out .= "\t\t<username>$safe_user</username>\n";
+                $out .= "\t\t<id>$safe_userid</id>\n";
+                $out .= "\t</contributor>\n";
+                $out .= "\t<comment>$safe_comment</comment>\n";
+                $out .= "\t<model>$safe_model</model>\n";
+                $out .= "\t<format>$safe_format</format>\n";
+                $out .= "\t<text bytes=\"{$revision->size}\">$safe_text</text>\n";
+                $out .= "\t<sha1>{$revision->sha1}</sha1>\n";
+                $out .= "</revision>\n";
+            }
+        }
+    }
+
+    // write the file in chunks
+    fwrite( $file, $out );
+    $out = '';
+}
+
+Log::info( "you mega-exported $total revisions! nice shot!" );
+
+fwrite( $file, "</mediawiki>\n" );
+fclose( $file );