diff --git a/tools/README.md b/tools/README.md index 62f0d4c..448a7ad 100644 --- a/tools/README.md +++ b/tools/README.md @@ -1,69 +1,89 @@ # boz-mw command line tools This is a directory for command line tools that uses the boz-mw framework. Actually there are not too much tools. ## Configuration Copy the `config-example.php` to `config.php` and fill it with your bot credentials. ## Available scripts ### Replace script - `replace.php` This is a script to sobstitute text in the wikitext from an API query. e.g. to transform in *farfallese* the whole Italian Wikipedia: ```bash ./replace.php --wiki=itwiki --generator=allpages \ a afa \ e efe \ i ifi \ o ofo \ u ufu ``` e.g. to replace a simple template parameter in top of the page, e.g. from `{{Sito web|commerciale = Sì}}` to `{{Sito web|lucro = Sì}}`: ```bash ./replace.php \ --wiki=itwiki \ --generator=transcludedin \ --titles=Template:Sito_web \ --rvsection=0 \ --regex \ '/\|commerciale(.*=.*)(Sì|No|sì|no)(.*)/' \ '|lucro$1$2$3' ``` Other options: ```bash ./replace.php --help ``` You can see some [examples](./examples). -### Mega export - `mega-export.php` +### Mega export - `mega-export-xml.php` This is a script that acts similar to the `[[Special:Export]]` page, but exporting the full page history. Note that you have to provide your user credentials in the `config.php` script in order to download more than `50` revisions at time. ``` Usage: - ./mega-export.php --wiki=WIKI --file=out.xml [OPTIONS] Page_title + ./mega-export-xml.php --wiki=WIKI --file=out.xml [OPTIONS] Page_title Allowed OPTIONS: --wiki=VALUE Available wikis: itwiki, wikidatawiki, commonswiki, metawiki, landscapeforwiki --limit=VALUE Number of revisions for each request --file=VALUE Output filename --help|-h Show this help and quit ``` E.g. 
to download the full history of the [Software libero](https://it.wikipedia.org/wiki/Software_libero) page: ``` ./mega-export.php --wiki=itwiki --file=out.xml "Software libero" ``` -Note that actually the official MediaWiki/XML format is actually mistreated at least for the heading section: you will not obtain the namespace list, the wiki name, and other unuseful things. Just revisions. Much revisions. +### Mega export - `mega-export-csv.php` + +This is a script that acts similarly to the `[[Special:Export]]` page, but it exports the metadata of the full page history (date, time, size, user and comment of each revision) as CSV, without the page content. + +Note that you have to provide your user credentials in the `config.php` script in order to download more than `50` revisions at a time. + +``` +Usage: + ./mega-export-csv.php --wiki=WIKI --file=out.csv [OPTIONS] Page_title +Allowed OPTIONS: + --wiki=VALUE Available wikis: itwiki, wikidatawiki, commonswiki, metawiki, landscapeforwiki + --limit=VALUE Number of revisions for each request + --file=VALUE Output filename + --help|-h Show this help and quit +``` + +E.g. to download the full history of the [Software libero](https://it.wikipedia.org/wiki/Software_libero) page: + +``` +./mega-export-csv.php --wiki=itwiki --file=out.csv "Software libero" +``` diff --git a/tools/mega-export.php b/tools/mega-export-csv.php similarity index 69% copy from tools/mega-export.php copy to tools/mega-export-csv.php index 8e31354..6a233c6 100755 --- a/tools/mega-export.php +++ b/tools/mega-export-csv.php @@ -1,200 +1,197 @@ #!/usr/bin/php . // exit if not CLI $argv or exit( 1 ); // load boz-mw require __DIR__ . 
'/../autoload-with-laser-cannon.php'; +define( 'MEDIAWIKI_DATE_FORMAT', 'Y-m-d\TH:i:sZ'); + // the number '500' gives to much $DEFAULT_LIMIT = 100; // load configuration config_wizard( 'config.php' ); use \web\MediaWikis; use \cli\Log; // all the available wiki UIDs $mediawiki_uids = implode( ', ', MediaWikis::allUIDs() ); // register all CLI parameters $opts = cli_options() ->addValued( 'wiki', null, "Available wikis: $mediawiki_uids" ) ->addValued( 'limit', null, "Number of revisions for each request", $DEFAULT_LIMIT ) ->addValued( 'file', null, "Output filename", 'export.xml' ) ->addFlag( 'help', 'h', "Show this help and quit" ); $messages = []; // choosen wiki $wiki_uid = $opts->get( 'wiki' ); if( !$wiki_uid ) { $messages[] = "Please specify --wiki=WIKI"; } // page titles $page_titles = $opts::unnamedArguments(); if( !$page_titles ) { $messages[] = "Please specify some page titles"; } // output filename $filename = $opts->get( 'file' ); if( !$filename ) { - $messages[] = "Please specify a filename"; + // as default assume standard output + $filename = 'php://output'; } $limit = (int) $opts->get( 'limit' ); // show the help $show_help = $opts->get( 'help' ); if( $show_help ) { $messages = []; } else { $show_help = $messages; } if( $show_help ) { echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI --file=export.xml [OPTIONS] Page_title\n"; echo "Allowed OPTIONS:\n"; $opts->printParams(); foreach( $messages as $msg ) { echo "\nError: $msg"; } echo "\n"; exit( $opts->get( 'help' ) ? 
0 : 1 ); } // try to open the file $file = fopen( $filename, 'w' ); if( !$file ) { Log::error( "Can't open file '$filename'" ); exit( 1 ); } // pick the wiki and login $wiki = wiki( $wiki_uid )->login(); // build the MediaWiki API query $requests = $wiki->createQuery( [ 'action' => 'query', 'titles' => $page_titles, 'prop' => 'revisions', 'rvprop' => [ 'ids', 'flags', 'timestamp', 'user', 'userid', 'size', 'slotsize', 'sha1', 'comment', - 'content', ], 'rvslots' => 'main', 'rvlimit' => $limit, ] ); // total number of revisions $total = 0; $n_requests = 1; +fputcsv( $file, [ // CSV header row: the column order must mirror the rows written for each revision (user name first, then user ID) + "Date", + "Time", + "Dimension", + "User name", + "User ID", + "Comment", +] ); + // do not print to the out -$out = '' . "\n"; foreach( $requests as $request ) { // show a kind of progress Log::info( sprintf( "processing request %d (continuing from $total revisions)", $n_requests++ ) ); $response_warning_shown = false; foreach( $request->query->pages as $page ) { if( isset( $page->missing ) ) { Log::error( "Page '{$page->title}' is missing" ); exit( 1 ); } $alert_much_revisions = true; foreach( $page->revisions as $i => $revision ) { // avoid nonsense revisions if( empty( $revision->comment ) ) { $count = count( $page->revisions ); if( $count !== $limit && !$response_warning_shown ) { Log::warn( "response with $count revisions instead of $limit: consider to lower your limit (ignore if you see this just once)" ); $response_warning_shown = true; } } $total++; foreach( $revision->slots as $slot ) { - // avoid nonsense slots - if( empty( $slot->contentmodel ) ) { - continue; - } - - $safe_user = htmlentities( $revision->user ); - $safe_userid = htmlentities( $revision->userid ); - $safe_comment = htmlentities( $revision->comment ); - $safe_model = htmlentities( $slot->contentmodel ); - $safe_format = htmlentities( $slot->contentformat ); - $safe_text = htmlentities( $slot->{'*'} ); - - $out .= "\n"; - $out .= "\t{$revision->revid}\n"; - $out .= "\t{$revision->parentid}\n"; - $out .= 
"\t{$revision->timestamp}\n"; - $out .= "\t\n"; - $out .= "\t\t$safe_user\n"; - $out .= "\t\t$safe_userid\n"; - $out .= "\t\n"; - $out .= "\t$safe_comment ?>"; - $out .= "\t$safe_model\n"; - $out .= "\t$safe_format\n"; - $out .= "\tsize}\">$safe_text\n"; - $out .= "\t{$revision->sha1}\n"; - $out .= "\n"; + $safe_user = $revision->user; + $safe_userid = $revision->userid; + $size = $revision->size; + $timestamp = $revision->timestamp; + $comment = $revision->comment; + + $timestamp_date = DateTime::createFromFormat( MEDIAWIKI_DATE_FORMAT, $timestamp ); + $timestamp_ymd = $timestamp_date->format( 'Y-m-d' ); + $timestamp_his = $timestamp_date->format( 'H:i:s' ); + + fputcsv( $file, [ + $timestamp_ymd, + $timestamp_his, + $size, + $safe_user, + $safe_userid, + $comment, + ] ); } } } - - // write the file in chunks - fwrite( $file, $out ); - $out = ''; } Log::info( sprintf( - "you mega-exported $total revisions! nice shot! See %s", - $opts->get( 'file' ) + "you mega-exported %d revisions!", + $total ) ); -fwrite( $file, "\n" ); fclose( $file ); diff --git a/tools/mega-export.php b/tools/mega-export-xml.php similarity index 98% copy from tools/mega-export.php copy to tools/mega-export-xml.php index 8e31354..45069b5 100755 --- a/tools/mega-export.php +++ b/tools/mega-export-xml.php @@ -1,200 +1,200 @@ #!/usr/bin/php . // exit if not CLI $argv or exit( 1 ); // load boz-mw require __DIR__ . 
'/../autoload-with-laser-cannon.php'; // the number '500' gives to much $DEFAULT_LIMIT = 100; // load configuration config_wizard( 'config.php' ); use \web\MediaWikis; use \cli\Log; // all the available wiki UIDs $mediawiki_uids = implode( ', ', MediaWikis::allUIDs() ); // register all CLI parameters $opts = cli_options() ->addValued( 'wiki', null, "Available wikis: $mediawiki_uids" ) ->addValued( 'limit', null, "Number of revisions for each request", $DEFAULT_LIMIT ) ->addValued( 'file', null, "Output filename", 'export.xml' ) ->addFlag( 'help', 'h', "Show this help and quit" ); $messages = []; // choosen wiki $wiki_uid = $opts->get( 'wiki' ); if( !$wiki_uid ) { $messages[] = "Please specify --wiki=WIKI"; } // page titles $page_titles = $opts::unnamedArguments(); if( !$page_titles ) { $messages[] = "Please specify some page titles"; } // output filename $filename = $opts->get( 'file' ); if( !$filename ) { $messages[] = "Please specify a filename"; } $limit = (int) $opts->get( 'limit' ); // show the help $show_help = $opts->get( 'help' ); if( $show_help ) { $messages = []; } else { $show_help = $messages; } if( $show_help ) { echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI --file=export.xml [OPTIONS] Page_title\n"; echo "Allowed OPTIONS:\n"; $opts->printParams(); foreach( $messages as $msg ) { echo "\nError: $msg"; } echo "\n"; exit( $opts->get( 'help' ) ? 0 : 1 ); } // try to open the file $file = fopen( $filename, 'w' ); if( !$file ) { Log::error( "Can't open file '$filename'" ); exit( 1 ); } // pick the wiki and login $wiki = wiki( $wiki_uid )->login(); // build the MediaWiki API query $requests = $wiki->createQuery( [ 'action' => 'query', 'titles' => $page_titles, 'prop' => 'revisions', 'rvprop' => [ 'ids', 'flags', 'timestamp', 'user', 'userid', 'size', 'slotsize', 'sha1', 'comment', 'content', ], 'rvslots' => 'main', 'rvlimit' => $limit, ] ); // total number of revisions $total = 0; $n_requests = 1; // do not print to the out $out = '' . 
"\n"; foreach( $requests as $request ) { // show a kind of progress Log::info( sprintf( "processing request %d (continuing from $total revisions)", $n_requests++ ) ); $response_warning_shown = false; foreach( $request->query->pages as $page ) { if( isset( $page->missing ) ) { Log::error( "Page '{$page->title}' is missing" ); exit( 1 ); } $alert_much_revisions = true; foreach( $page->revisions as $i => $revision ) { // avoid nonsense revisions if( empty( $revision->comment ) ) { $count = count( $page->revisions ); if( $count !== $limit && !$response_warning_shown ) { Log::warn( "response with $count revisions instead of $limit: consider to lower your limit (ignore if you see this just once)" ); $response_warning_shown = true; } } $total++; foreach( $revision->slots as $slot ) { // avoid nonsense slots if( empty( $slot->contentmodel ) ) { continue; } $safe_user = htmlentities( $revision->user ); $safe_userid = htmlentities( $revision->userid ); $safe_comment = htmlentities( $revision->comment ); $safe_model = htmlentities( $slot->contentmodel ); $safe_format = htmlentities( $slot->contentformat ); $safe_text = htmlentities( $slot->{'*'} ); $out .= "\n"; $out .= "\t{$revision->revid}\n"; $out .= "\t{$revision->parentid}\n"; $out .= "\t{$revision->timestamp}\n"; $out .= "\t\n"; $out .= "\t\t$safe_user\n"; $out .= "\t\t$safe_userid\n"; $out .= "\t\n"; $out .= "\t$safe_comment ?>"; $out .= "\t$safe_model\n"; $out .= "\t$safe_format\n"; $out .= "\tsize}\">$safe_text\n"; $out .= "\t{$revision->sha1}\n"; $out .= "\n"; } } } // write the file in chunks fwrite( $file, $out ); $out = ''; } Log::info( sprintf( "you mega-exported $total revisions! nice shot! See %s", $opts->get( 'file' ) ) ); fwrite( $file, "\n" ); fclose( $file ); diff --git a/tools/mega-export.php b/tools/mega-export.php deleted file mode 100755 index 8e31354..0000000 --- a/tools/mega-export.php +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/php -. 
- -// exit if not CLI -$argv or exit( 1 ); - -// load boz-mw -require __DIR__ . '/../autoload-with-laser-cannon.php'; - -// the number '500' gives to much -$DEFAULT_LIMIT = 100; - -// load configuration -config_wizard( 'config.php' ); - -use \web\MediaWikis; -use \cli\Log; - -// all the available wiki UIDs -$mediawiki_uids = implode( ', ', MediaWikis::allUIDs() ); - -// register all CLI parameters -$opts = cli_options() - ->addValued( 'wiki', null, "Available wikis: $mediawiki_uids" ) - ->addValued( 'limit', null, "Number of revisions for each request", $DEFAULT_LIMIT ) - ->addValued( 'file', null, "Output filename", 'export.xml' ) - ->addFlag( 'help', 'h', "Show this help and quit" ); - -$messages = []; - -// choosen wiki -$wiki_uid = $opts->get( 'wiki' ); -if( !$wiki_uid ) { - $messages[] = "Please specify --wiki=WIKI"; -} - -// page titles -$page_titles = $opts::unnamedArguments(); -if( !$page_titles ) { - $messages[] = "Please specify some page titles"; -} - -// output filename -$filename = $opts->get( 'file' ); -if( !$filename ) { - $messages[] = "Please specify a filename"; -} - -$limit = (int) $opts->get( 'limit' ); - -// show the help -$show_help = $opts->get( 'help' ); -if( $show_help ) { - $messages = []; -} else { - $show_help = $messages; -} - -if( $show_help ) { - echo "Usage:\n {$argv[ 0 ]} --wiki=WIKI --file=export.xml [OPTIONS] Page_title\n"; - echo "Allowed OPTIONS:\n"; - - $opts->printParams(); - - foreach( $messages as $msg ) { - echo "\nError: $msg"; - } - echo "\n"; - - exit( $opts->get( 'help' ) ? 
0 : 1 ); -} - -// try to open the file -$file = fopen( $filename, 'w' ); -if( !$file ) { - Log::error( "Can't open file '$filename'" ); - exit( 1 ); -} - -// pick the wiki and login -$wiki = wiki( $wiki_uid )->login(); - -// build the MediaWiki API query -$requests = $wiki->createQuery( [ - 'action' => 'query', - 'titles' => $page_titles, - 'prop' => 'revisions', - 'rvprop' => [ - 'ids', - 'flags', - 'timestamp', - 'user', - 'userid', - 'size', - 'slotsize', - 'sha1', - 'comment', - 'content', - ], - 'rvslots' => 'main', - 'rvlimit' => $limit, -] ); - -// total number of revisions -$total = 0; - -$n_requests = 1; - -// do not print to the out -$out = '' . "\n"; -foreach( $requests as $request ) { - - // show a kind of progress - Log::info( sprintf( - "processing request %d (continuing from $total revisions)", - $n_requests++ - ) ); - - $response_warning_shown = false; - - foreach( $request->query->pages as $page ) { - - if( isset( $page->missing ) ) { - Log::error( "Page '{$page->title}' is missing" ); - exit( 1 ); - } - - $alert_much_revisions = true; - foreach( $page->revisions as $i => $revision ) { - - // avoid nonsense revisions - if( empty( $revision->comment ) ) { - $count = count( $page->revisions ); - if( $count !== $limit && !$response_warning_shown ) { - Log::warn( "response with $count revisions instead of $limit: consider to lower your limit (ignore if you see this just once)" ); - $response_warning_shown = true; - } - } - - $total++; - - foreach( $revision->slots as $slot ) { - - // avoid nonsense slots - if( empty( $slot->contentmodel ) ) { - continue; - } - - $safe_user = htmlentities( $revision->user ); - $safe_userid = htmlentities( $revision->userid ); - $safe_comment = htmlentities( $revision->comment ); - $safe_model = htmlentities( $slot->contentmodel ); - $safe_format = htmlentities( $slot->contentformat ); - $safe_text = htmlentities( $slot->{'*'} ); - - $out .= "\n"; - $out .= "\t{$revision->revid}\n"; - $out .= "\t{$revision->parentid}\n"; 
- $out .= "\t{$revision->timestamp}\n"; - $out .= "\t\n"; - $out .= "\t\t$safe_user\n"; - $out .= "\t\t$safe_userid\n"; - $out .= "\t\n"; - $out .= "\t$safe_comment ?>"; - $out .= "\t$safe_model\n"; - $out .= "\t$safe_format\n"; - $out .= "\tsize}\">$safe_text\n"; - $out .= "\t{$revision->sha1}\n"; - $out .= "\n"; - } - } - } - - // write the file in chunks - fwrite( $file, $out ); - $out = ''; -} - -Log::info( sprintf( - "you mega-exported $total revisions! nice shot! See %s", - $opts->get( 'file' ) -) ); - -fwrite( $file, "\n" ); -fclose( $file ); diff --git a/tools/mega-export.php b/tools/mega-export.php new file mode 120000 index 0000000..eb18978 --- /dev/null +++ b/tools/mega-export.php @@ -0,0 +1 @@ +mega-export-xml.php \ No newline at end of file