diff --git a/README.md b/README.md index 76ca75f..511cc02 100644 --- a/README.md +++ b/README.md @@ -1,116 +1,116 @@ # MediaWiki Orphanizer bot -This software is an Italian Wikipedia bot. It delinks page titles. Infos: +This software is an Italian Wikipedia bot. It delinks page titles. Info: * https://it.wikipedia.org/wiki/Utente:OrfanizzaBot ## Installation sudo apt install git php-cli git clone --recursive https://github.com/valerio-bozzolan/MediaWikiOrphanizerBot.git ## Configuration You know you should provide your bot credentials in order to use a tool. 1. Open the file [`config-example.php`](config-example.php) with a text editor 2. Fill your bot credentials 3. Save-as `config.php` ## On-wiki configuration You need two pages: one with a list of links pointing to the pages to orphanize (pass its title via the `list` parameter), and one with generic config (to be passed via `cfg`). The latter should be a JSON page and can have the following options: * `summary` (string) - The summary to use when editing. * `list-summary` (string) - The summary to use when editing the page list. * `done-text` (string) - What to replace a processed wlink with. $1 is the pointed title. * `ns` (array|string) - To only edit pages on the given namespaces. `null` means all namespaces. You can specify also a string with values separated by a pipe, like in command line. 
* `warmup` (int) - number of __seconds__ to wait before starting (after last edit on the list) * `cooldown` (int) - number of __edits__ to do until shutdown (you may want to re-schedule) * `delay` (int) - number of __seconds__ to wait before saving * `turbofresa` (int) - if the list is older than this number of seconds, a turbofresa will be spawned to clear the list An example: ``` { "summary": "Bot: orphanizing links", "list-summary": "Updating list", "done-text": "* [[Special:WhatLinksHere/$1]] - {{done}}", "ns": [ 0 ], "warmup": 120, "cooldown": 10, "delay": 30 } ``` ## Usage from command line ./orphanize.php --help ``` Welcome in your MediaWiki Orphanizer bot! Available options, most of them also on-wiki: --wiki=VALUE - Specify a wiki from its UID + Specify a wiki from its UID --cfg=VALUE - Title of an on-wiki configuration page with JSON content model + Title of an on-wiki configuration page with JSON content model --list=VALUE - Specify a pagename that should contain the wikilinks to be orphanized + Specify a pagename that should contain the wikilinks to be orphanized --summary=VALUE - Edit summary + Edit summary --list-summary=VALUE - Edit summary for editing the list + Edit summary for editing the list --done-text=VALUE - Replacement for the wikilink in the list + Replacement for the wikilink in the list --ns=VALUE - Namespace whitelist (values separated by pipe) + Namespace whitelist (values separated by pipe) --delay=VALUE - Additional delay between each edit + Additional delay between each edit --warmup=VALUE - Start only if the last edit on the list was done at least $warmup seconds ago + Start only if the last edit on the list was done at least $warmup seconds ago --cooldown=VALUE - End early when reaching this number of edits + End early when reaching this number of edits --turbofresa=VALUE - If the list is older than this number of seconds a turbofresa will be spawned to clean the list + If the list is older than this number of seconds a turbofresa 
will be spawned to clean the list --turbofresa-text=VALUE - Text that will be saved to clean an old list + Text that will be saved to clean an old list --turbofresa-summary=VALUE - Edit summary to be used when cleaning an old list + Edit summary to be used when cleaning an old list --seealso=VALUE - Title of your local "See also" section + Title of your local "See also" section --skip-permissions - Execute the bot even if the list was last edited by a non-sysop (or by the bot itself) + Execute the bot even if the list was last edited by a non-sysop (or by the bot itself) --debug - Increase verbosity + Increase verbosity --help -h - Show this message and quit + Show this message and quit --no-interaction - Do not confirm every change + Do not confirm every change Example: - ./orphanize.php --wiki=itwiki --list=Wikipedia:PDC/Elenco + ./orphanize.php --wiki=itwiki --list=Wikipedia:PDC/Elenco Have fun! by Valerio Bozzolan, Daimona Eaytoy ``` ## License -Copyright (C) 2019 [Valerio Bozzolan](https://it.wikipedia.org/wiki/Utente:Valerio_Bozzolan) +Copyright (C) 2019, 2020, 2021 [Valerio Bozzolan](https://it.wikipedia.org/wiki/Utente:Valerio_Bozzolan) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. diff --git a/orphanize.php b/orphanize.php index 57caab9..ed7054c 100755 --- a/orphanize.php +++ b/orphanize.php @@ -1,522 +1,522 @@ #!/usr/bin/php . 
namespace orphanizerbot;

/*
 * MediaWiki Orphanizer bot - command line entry point.
 *
 * Workflow implemented by this script:
 *  1. read the options from the command line and from the on-wiki configuration;
 *  2. read the on-wiki list page: its wikilinks are the titles to be orphanized,
 *     its last revision tells who edited it and when;
 *  3. check warmup, turbofresa and the permissions of the last editor;
 *  4. find every page linking to the listed titles;
 *  5. strip the wikilinks from those pages and save them;
 *  6. mark the processed entries in the list.
 *
 * The helpers show_help(), wiki_config() and option(), and the Config class,
 * are used here but defined outside of this part of the file.
 */

// die on whatever error: promote every reported PHP error to an exception
set_error_handler( function( $errno, $errstr, $errfile, $errline ) {
	if( error_reporting() !== 0 ) {
		throw new \ErrorException( $errstr, 0, $errno, $errfile, $errline );
	}
} );

// do not expose from web
isset( $argv[ 0 ] ) or exit( 1 );

// autoload classes
require __DIR__ . '/includes/autoload.php';

// require config file
file_exists( $config_path = __DIR__ . '/config.php' )
	or die( "Please provide the configuration file at $config_path\n" );

require $config_path;

// how many titles (or page IDs) to request at a time - this is a MediaWiki limit
define( 'MAX_TRANCHE_TITLES', 50 );

// classes used
use \cli\Input;
use \cli\Opts;
use \cli\ParamFlag;
use \cli\ParamValued;
use \cli\Log;
use \web\MediaWikis;
use \mw\Wikilink;
use \mw\Ns;
use \mw\API;
use \mw\API\ProtectedPageException;
use \mw\API\EditConflictException;
use \regex\Generic as Regex;

// register available options
$opts = Opts::instance()->register( [

	// register arguments with a value
	new ParamValued( 'wiki',               null, 'Specify a wiki from its UID' ),
	new ParamValued( 'cfg',                null, 'Title of an on-wiki configuration page with JSON content model' ),
	new ParamValued( 'list',               null, 'Specify a pagename that should contain the wikilinks to be orphanized' ),
	new ParamValued( 'summary',            null, 'Edit summary' ),
	new ParamValued( 'list-summary',       null, 'Edit summary for editing the list' ),
	new ParamValued( 'done-text',          null, 'Replacement for the wikilink in the list' ),
	new ParamValued( 'ns',                 null, 'Namespace whitelist (values separated by pipe)' ),
	new ParamValued( 'delay',              null, 'Additional delay between each edit' ),
	new ParamValued( 'warmup',             null, 'Start only if the last edit on the list was done at least $warmup seconds ago' ),
	new ParamValued( 'cooldown',           null, 'End early when reaching this number of edits' ),
	new ParamValued( 'turbofresa',         null, 'If the list is older than this number of seconds a turbofresa will be spawned to clean the list' ),
	new ParamValued( 'turbofresa-text',    null, 'Text that will be saved to clean an old list' ),
	new ParamValued( 'turbofresa-summary', null, 'Edit summary to be used when cleaning an old list' ),
	new ParamValued( 'seealso',            null, 'Title of your local "See also" section' ),

	// register arguments without a value
	new ParamFlag( 'skip-permissions', null, 'Execute the bot even if the list was last edited by a non-sysop (or by the bot itself)' ),
	new ParamFlag( 'debug',            null, 'Increase verbosity' ),
	new ParamFlag( 'help',             'h',  'Show this message and quit' ),
	new ParamFlag( 'no-interaction',   null, 'Do not confirm every change' ),
] );

// show help screen
if( $opts->getArg( 'help' ) ) {
	show_help();
}

// cli-only parameters
$NO_INTERACTION = $opts->getArg( 'no-interaction' );
$TITLE_SOURCE   = $opts->getArg( 'list', 'Utente:OrfanizzaBot/Wikilink da orfanizzare' );

Log::info( "start" );

// increase verbosity
if( $opts->getArg( 'debug' ) ) {
	Log::$DEBUG = true;
}

// wiki uid (from command line or from configuration file)
$wiki_uid = Config::instance()->get( 'wiki' );
$wiki_uid = $opts->getArg( 'wiki', $wiki_uid );
if( ! $wiki_uid ) {
	Log::error( "please choose the wiki! exit" );
	exit( 1 );
}

// wiki instance
$wiki = MediaWikis::findFromUid( $wiki_uid );

// try to load the wiki config
try {
	wiki_config();
} catch( \Exception $e ) {
	// the cause is unknown, but this occasionally fails
	Log::error( sprintf(
		"failed reading wiki configuration: %s",
		$e->getMessage()
	) );
	exit( 1 );
}

// parameters available both from cli and on-wiki
$SUMMARY            = option( 'summary',            "Bot: pages orphanization" );
$LIST_SUMMARY       = option( 'list-summary',       "Bot: orphanization list update" );
$DONE_TEXT          = option( 'done-text',          "* [[Special:WhatLinksHere/$1]] - {{done}}" );
$NS                 = option( 'ns' );
$WARMUP             = option( 'warmup',             -1 );
$COOLDOWN           = option( 'cooldown',           1000 );
$DELAY              = option( 'delay',              0 );
$SEEALSO            = option( 'seealso',            "See also" );
$TURBOFRESA         = option( 'turbofresa',         86400 );
$TURBOFRESA_TEXT    = option( 'turbofresa-text',    "== List ==\n* ..." );
$TURBOFRESA_SUMMARY = option( 'turbofresa-summary', "Bot: list clean" );
$SKIP_PERMISSIONS   = option( 'skip-permissions' );

// hardcoded values (@TODO: consider an option)
$GROUP = 'sysop';

// my username (well, it's not so important, just used to have a friendlier log message)
$ME = explode( '@', API::$DEFAULT_USERNAME, 2 )[ 0 ];

// query titles to be orphanized alongside the last revision of the list
$responses = $wiki->createQuery( [
	'action'  => 'query',
	'titles'  => $TITLE_SOURCE,
	'prop'    => [
		'links',
		'revisions',
	],
	'rvslots' => 'main',
	'rvprop'  => [
		'comment',   // the edit summary is used to detect if the list was already cleaned
		'user',      // the username is used to detect if the last user is allowed
		'timestamp', // the timestamp is used to check the age of the last edit
		'content',   // page content
	],
] );

// remember this to avoid edit conflicts
$list_timestamp = null;
$list_content   = null;

// collect links and take the last edit timestamp
$titles_to_be_orphanized = [];
Log::info( "reading $TITLE_SOURCE" );
foreach( $responses as $response ) {
	foreach( $response->query->pages as $page ) {

		// check if the list does not exist
		if( isset( $page->missing ) ) {
			Log::error( "missing list $TITLE_SOURCE" );
			exit( 1 );
		}

		if( isset( $page->revisions ) ) {

			// there is just one revision
			$revision = reset( $page->revisions );

			// save list content
			$list_content = $revision->slots->main->{ '*' };

			// check warmup
			$list_timestamp     = $revision->timestamp;
			$timestamp_datetime = \DateTime::createFromFormat( \DateTime::ISO8601, $list_timestamp );

			// createFromFormat() returns false on failure: stop with a clear message
			// instead of failing on the method call below
			if( $timestamp_datetime === false ) {
				Log::error( "unable to parse the list timestamp $list_timestamp" );
				exit( 1 );
			}

			$seconds = time() - $timestamp_datetime->getTimestamp();
			if( $seconds < $WARMUP ) {
				Log::info( "list edited just $seconds seconds ago: quit until warmup $WARMUP" );
				exit( 1 );
			}

			// eventually clear list
			if( $seconds > $TURBOFRESA ) {
				if( $revision->comment === $TURBOFRESA_SUMMARY ) {
					Log::info( "list edited $seconds seconds ago. already cleared. quit" );
				} else {
					Log::info( "list edited $seconds seconds ago. spawning a turbofresa to clear the list. quit" );

					// summary and content come from the 'turbofresa-summary'
					// and 'turbofresa-text' options
					$wiki->login()->edit( [
						'title'         => $TITLE_SOURCE,
						'summary'       => $TURBOFRESA_SUMMARY,
						'text'          => $TURBOFRESA_TEXT,
						'basetimestamp' => $list_timestamp,
						'bot'           => 1,
					] );
				}
				exit( 0 );
			}

			// check user
			$lastuser = $revision->user;
			$rights = $wiki->fetch( [
				'action'  => 'query',
				'list'    => 'users',
				'usprop'  => 'groups',
				'ususers' => $lastuser,
			] );

			// warn about that above user and eventually quit
			$lastuser_was = "$lastuser was the last editor: ";

			// an entry without groups is treated as "not in the group"
			// NOTE(review): presumably the case for anonymous or missing users - confirm against the API
			$lastuser_info = reset( $rights->query->users );
			$groups = ( $lastuser_info && isset( $lastuser_info->groups ) )
				? $lastuser_info->groups
				: [];

			if( in_array( $GROUP, $groups, true ) ) {
				Log::info( $lastuser_was . "a $GROUP. OK" );
			} else {

				// show a friendly message if it's just me
				$its_me = ( $wiki->isLogged() && $lastuser === $wiki->getUsername() ) || $lastuser === $ME;
				if( $its_me ) {
					Log::info( $lastuser_was . "It's-a me, Mario! quit" );
				} else {
					Log::error( $lastuser_was . "not a $GROUP. quit" );
				}

				if( $SKIP_PERMISSIONS ) {
					Log::warn( "skip list permission failure because of 'skip-permissions' option enabled" );
				} else {
					// it's me? exit normally.
					exit( $its_me ? 0 : 1 );
				}
			}
		}

		// collect links (if any)
		if( isset( $page->links ) ) {
			foreach( $page->links as $link ) {
				$titles_to_be_orphanized[] = $link->title;
			}
		}
	}
}

// keep a copy: the original array is consumed in tranches below
$involved_pagetitles = $titles_to_be_orphanized;

// log titles
if( $titles_to_be_orphanized ) {
	Log::info( 'found ' . count( $titles_to_be_orphanized ) . ' pages to be orphanized:' );
	foreach( $titles_to_be_orphanized as $title ) {
		Log::info( " $title" );
	}
}

// associative array with page IDs as keys and a boolean as value: keying by
// page ID keeps a page only once even when it links to several listed titles
$involved_pageids = [];

// note that the API accepts a maximum tranche of titles
while( $less_titles_to_be_orphanized = array_splice( $titles_to_be_orphanized, 0, MAX_TRANCHE_TITLES ) ) {

	// API arguments for the linkshere query
	$linksto_args = [
		'action'  => 'query',
		'titles'  => $less_titles_to_be_orphanized,
		'prop'    => 'linkshere',
		'lhprop'  => [
			'pageid',
			'title',
		],
		'lhshow'  => '!redirect',
		'lhlimit' => 300,
	];

	// limit to certain namespaces from command line
	if( $NS !== null ) {
		$linksto_args[ 'lhnamespace' ] = $NS;
	}

	// cumulate the linkshere page ids
	Log::info( "requesting linkshere..." );
	$linksto = $wiki->createQuery( $linksto_args );
	foreach( $linksto as $response ) {
		foreach( $response->query->pages as $page ) {
			if( isset( $page->linkshere ) ) {
				foreach( $page->linkshere as $linkingpage ) {
					// the list itself links to every title: never orphanize it
					if( $linkingpage->title !== $TITLE_SOURCE ) {
						$involved_pageids[ (int) $linkingpage->pageid ] = true;
					}
				}
			}
		}
	}
}

// from now on a plain list of unique page IDs is enough
$involved_pageids = array_keys( $involved_pageids );

// count of involved pages
if( $involved_pagetitles ) {
	Log::info( sprintf(
		"found %d pages containing the %d involved wlinks",
		count( $involved_pageids ),
		count( $involved_pagetitles )
	) );
}

// number of edited pages
$edits = 0;

// note that the API accepts a maximum tranche of IDs
while( $less_involved_pageids = array_splice( $involved_pageids, 0, MAX_TRANCHE_TITLES ) ) {

	// query last revision
	$responses = $wiki->createQuery( [
		'action'  => 'query',
		'pageids' => $less_involved_pageids,
		'prop'    => 'revisions',
		'rvslots' => 'main',
		'rvprop'  => [
			'content',
			'timestamp',
		],
	] );

	// for each response
	foreach( $responses as $response ) {

		// for each page
		foreach( $response->query->pages as $page ) {

			// avoid too many edits: stop as soon as the number of edits
			// reaches the cooldown (not one edit later)
			if( $edits >= $COOLDOWN ) {
				Log::info( "reached cooldown: stop" );
				exit( 0 );
			}

			// page ID to be edited
			$pageid = $page->pageid;

			// does it have a revision?
			if( !isset( $page->revisions[ 0 ] ) ) {
				continue;
			}

			// the first revision
			$revision = $page->revisions[ 0 ];

			// timestamp of the revision useful to avoid edit conflicts
			$timestamp = $revision->timestamp;

			// wikitext from the main slot of this revision
			$wikitext_raw = $revision->slots->main->{ '*' };

			// create a Wikitext object
			$wikitext = $wiki->createWikitext( $wikitext_raw );

			// for each of the titles to be orphanized
			foreach( $involved_pagetitles as $involved_pagetitle ) {

				// parse the title being orphanized
				$title = $wiki->createTitleParsing( $involved_pagetitle );

				// if it's a category (namespace 14), remove it
				if( $title->getNs()->getID() === 14 ) {
					$wikitext->removeCategory( $title->getTitle() );
				}

				// a wikilink with and without alias
				$wikilink_simple = $wiki->createWikilink( $title, Wikilink::NO_ALIAS );
				$wikilink_alias  = $wiki->createWikilink( $title, Wikilink::WHATEVER_ALIAS );

				// replace simple links e.g. [[Hello]]
				$wikilink_regex_simple = $wikilink_simple->getRegex( [
					'title-group-name' => 'title',
				] );

				// replace links with alias e.g. [[Hello|whatever]]
				$wikilink_regex_alias = $wikilink_alias->getRegex( [
					'alias-group-name' => 'alias',
				] );

				// replace entry from "See also" section
				$wikilink_regex_clean = $wikilink_simple->getRegex();
				$wikilink_regex_clean = Regex::spaceBurger( $wikilink_regex_clean );

				// the section title is embedded in a '/'-delimited pattern,
				// so the delimiter must be escaped as well
				$seealso = preg_quote( $SEEALSO, '/' );
				$seealso_regex =
					'/' .
						Regex::groupNamed( "\\n== *$seealso *== *((?!=).*\\n)*", 'keep' ) .
						Regex::groupNamed( "[ \\t]*\*[ \\t]*{$wikilink_regex_clean}.*\\n", 'wlink' ) .
					'/';

				Log::debug( "regex simple wikilink:" );
				Log::debug( $wikilink_regex_simple );

				Log::debug( "regex wikilink aliased:" );
				Log::debug( $wikilink_regex_alias );

				Log::debug( "regex see also:" );
				Log::debug( $seealso_regex );

				// strip out the entry from «See also» section
				$wikitext->pregReplaceCallback( $seealso_regex, function ( $matches ) {
					return $matches[ 'keep' ];
				} );

				// convert '[[Hello]]' to 'Hello'
				$wikitext->pregReplaceCallback( "/$wikilink_regex_simple/", function ( $matches ) {
					// fix unwanted indentations
					$title = ltrim( $matches[ 'title' ], ':' );
					return trim( $title );
				} );

				// convert '[[Hello|world]]' to 'world'
				$wikitext->pregReplaceCallback( "/$wikilink_regex_alias/", function ( $matches ) {
					// fix unwanted indentations
					return trim( $matches[ 'alias' ] );
				} );

			} // end loop titles to be orphanized

			// check for changes and save
			if( $wikitext->isChanged() ) {
				Log::info( "changes on page $pageid:" );
				foreach( $wikitext->getHumanUniqueSobstitutions() as $substitution ) {
					Log::info( "\t $substitution" );
				}

				if( $NO_INTERACTION || 'n' !== Input::yesNoQuestion( "confirm changes" ) ) {
					try {
						// wait before saving (not before the very first edit),
						// printing a dot per second
						if( $edits && $DELAY ) {
							Log::info( "delay $DELAY seconds", [ 'newline' => false ] );
							for( $i = 0; $i < $DELAY; $i++ ) {
								sleep( 1 );
								echo '.';
							}
							echo "\n";
						}

						// eventually login and save
						$wiki->login()->edit( [
							'pageid'        => $pageid,
							'text'          => $wikitext->getWikitext(),
							'summary'       => $SUMMARY,
							'basetimestamp' => $timestamp,
							'minor'         => 1,
							'bot'           => 1,
						] );
						$edits++;
					} catch( ProtectedPageException $e ) {
						Log::warn( "skip protected page $pageid" );
					} catch( EditConflictException $e ) {
						// someone edited the page in the meanwhile: skip it
						// instead of aborting the whole run
						Log::warn( "skip page $pageid because of an edit conflict" );
					}
				} // end confirmation

			} // end save

		} // end loop pages

	} // end loop responses

} // end loop involved page IDs

// content of the list
$wikitext = $wiki->createWikitext( $list_content );

// remove each entry from the list
foreach( $involved_pagetitles as $title_raw ) {
	$wlink = $wiki->createTitleParsing( $title_raw )
		->createWikilink( Wikilink::WHATEVER_ALIAS )
		->getRegex();

	// strip out the whole related line and replace with something else
	$from = "/.*$wlink.*/";
	// @todo In case done-text contains the full link to a page, and it has already been
	// replaced in a previous run, don't replace it again.
	$to = str_replace( '$1', $title_raw, $DONE_TEXT );
	$wikitext->pregReplace( $from, $to );
}

// update list
if( $wikitext->isChanged() ) {
	Log::info( "removing orphanized pages from list" );

	try {
		$wiki->login()->edit( [
			'title'         => $TITLE_SOURCE,
			'text'          => $wikitext->getWikitext(),
			'summary'       => $LIST_SUMMARY,
			'basetimestamp' => $list_timestamp,
			'bot'           => 1,
		] );
	} catch( ProtectedPageException $e ) {
		Log::warn( "can't update list because of protection" );
	} catch( EditConflictException $e ) {
		Log::warn( "ARGHHHH! Is someone editing my list? MY PRECIOUSss LIST!?!? WHAAT?? I will find you, and I will rewrite your edit. Damn human beings... asd." );
	}
} else {
	Log::info( "nothing to be done" );
}

Log::info( "end" );