diff --git "a/include/class-mw\\CompleteTitle.php" "b/include/class-mw\\CompleteTitle.php"
index f748370..88dab4d 100644
--- "a/include/class-mw\\CompleteTitle.php"
+++ "b/include/class-mw\\CompleteTitle.php"
@@ -1,202 +1,203 @@
.
# MediaWiki
namespace mw;

/**
 * A MediaWiki complete title (with namespace)
 *
 * It is the composition of a namespace (Ns) and of a title (Title)
 * that belong to a specific wiki.
 */
class CompleteTitle {

	/**
	 * Which MediaWiki site
	 *
	 * It must expose getBaseURL(), findNamespace() and getNamespace(),
	 * that are the only methods called on it from this class.
	 *
	 * @var object
	 */
	private $wiki;

	/**
	 * Namespace of this page
	 *
	 * @var Ns
	 */
	private $ns;

	/**
	 * Title of this page (the part after the namespace)
	 *
	 * @var Title
	 */
	private $title;

	/**
	 * Constructor
	 *
	 * @param $wiki  object Dependency injection of the wiki
	 * @param $ns    Ns     Dependency injection of the namespace
	 * @param $title Title  Dependency injection of the Title object (the part after the namespace)
	 */
	public function __construct( $wiki, Ns $ns, Title $title ) {
		$this->wiki  = $wiki;
		$this->ns    = $ns;
		$this->title = $title;
	}

	/**
	 * Get the namespace object
	 *
	 * @return Ns
	 */
	public function getNs() {
		return $this->ns;
	}

	/**
	 * Get the title object
	 *
	 * @return Title
	 */
	public function getTitle() {
		return $this->title;
	}

	/**
	 * Get the complete title, as displayed in a page, without underscores
	 *
	 * The namespace name is followed by a ':' only when it is not empty,
	 * so a page with an empty namespace name has no leading colon.
	 *
	 * @return string
	 */
	public function getCompleteTitle() {
		$title = $this->getTitle()->get();
		$ns    = $this->getNs()->getName();
		if( $ns ) {
			$ns .= ':';
		}
		return $ns . $title;
	}

	/**
	 * Get the complete title, as displayed in a page, but with underscores
	 *
	 * @return string
	 */
	public function getCompleteTitleUnderscored() {
		$title = $this->getCompleteTitle();
		return TitlePart::space2underscore( $title );
	}

	/**
	 * Get the {{SUBPAGENAME}} for this complete page title
	 *
	 * The namespace is never part of the result, because this is
	 * delegated to the Title object. So:
	 *  - return 'asd' from 'Discussion:The/asd'
	 *  - return 'Asd' from 'Discussion:Asd'
	 *
	 * See https://www.mediawiki.org/wiki/Help:Magic_words
	 *
	 * @return string
	 */
	public function getSubPageName() {
		return $this->getTitle()->getSubPageName();
	}

	/**
	 * Get the regex able to match this complete title
	 *
	 * @param $args array
	 * 	wikilink:         Boolean forwarded as-is to the namespace regex (default true)
	 * 	ns-group-name:    Name of the capturing group for the namespace
	 * 	title-group-name: Name of the capturing group for the title
	 * @return string
	 */
	public function getRegex( $args = [] ) {

		// default options
		$args = array_replace( [
			'wikilink'         => true,
			'ns-group-name'    => null,
			'title-group-name' => null,
		], $args );

		// namespace regex
		$ns = $this->getNs()->getRegex( [
			'wikilink' => $args[ 'wikilink' ],
		] );

		// title regex
		// see mw\Title#getRegex()
		$title = $this->getTitle()->getRegex();

		// @TODO: handle anchor

		// eventually group
		$ns    = \regex\Generic::groupNamed( $ns,    $args[ 'ns-group-name'    ] );
		$title = \regex\Generic::groupNamed( $title, $args[ 'title-group-name' ] );

		// spaces and underscores are tolerated between the namespace and the title
		return $ns . '[ _]*' . $title;
	}

	/**
	 * Create a Wikilink to this title
	 *
	 * @param $alias string|false|null (NULL: whatever, false: no one)
	 * @return Wikilink
	 */
	public function createWikilink( $alias = null ) {
		return new Wikilink( $this, $alias );
	}

	/**
	 * Get the URL to this page
	 *
	 * The underscored complete title is URL-encoded and appended
	 * to the base URL of the wiki.
	 *
	 * @return string
	 */
	public function getURL() {
		$title = $this->getCompleteTitleUnderscored();
		$title = urlencode( $title );
		return $this->wiki->getBaseURL() . $title;
	}

	/**
	 * Static constructor parsing a string
	 *
	 * The text before the first ':' is considered a namespace only if
	 * the wiki knows it. Otherwise the whole string is a title in the
	 * namespace with ID 0.
	 *
	 * @param $wiki object
	 * @param $s    string e.g. ' Mediawiki: test '
	 * @return self
	 */
	public static function createParsingTitle( $wiki, $s ) {

		// @TODO: parse also anchor

		// split namespace and title
		$tokens     = explode( ':', $s, 2 );
		$has_prefix = count( $tokens ) === 2;
		if( $has_prefix ) {
			$ns_raw    = $tokens[ 0 ];
			$title_raw = $tokens[ 1 ];
		} else {
			// no namespace? that's the main namespace!
			$ns_raw    = '';
			$title_raw = $tokens[ 0 ];
		}

		// validate namespace
		$ns = $wiki->findNamespace( $ns_raw );
		if( ! $ns ) {
			// that was the main namespace
			$ns = $wiki->getNamespace( 0 );

			// restore the ':' only if it was really in the input,
			// or a plain 'Title' would become ':Title'
			if( $has_prefix ) {
				$title_raw = "$ns_raw:$title_raw";
			}
		}

		$title = new Title( $title_raw, $wiki );
		return new self( $wiki, $ns, $title );
	}
}
diff --git "a/include/class-mw\\Title.php" "b/include/class-mw\\Title.php"
index 2d957d0..d2f31a3 100644
--- "a/include/class-mw\\Title.php"
+++ "b/include/class-mw\\Title.php"
@@ -1,63 +1,63 @@
.
# MediaWiki
namespace mw;

/**
 * A page Title without a namespace.
 *
 * See also CompleteTitle class.
 */
class Title extends TitlePartCapitalized {

	/**
	 * The wiki this title belongs to
	 *
	 * It must expose hasCapitalLinks(), that is the only method
	 * called on it from this class.
	 *
	 * @var object
	 */
	private $site;

	/**
	 * Constructor
	 *
	 * @param $name string
	 * @param $site object
	 */
	public function __construct( $name, $site ) {
		parent::__construct( $name );
		$this->site = $site;
	}

	/**
	 * Get the {{SUBPAGENAME}} for this page title
	 *
	 * Returns 'asd' from 'The/great/asd'
	 *
	 * See https://www.mediawiki.org/wiki/Help:Magic_words
	 *
	 * @return string
	 */
	public function getSubPageName() {
		// NOTE(review): basename() is a filesystem helper, so its handling of
		// trailing slashes and of platform-specific separators applies here
		// too - confirm that this is wanted for page titles
		return basename( $this->get() );
	}

	/**
	 * Get a regex matching this title part
	 *
	 * When the wiki has capital links the first letter is matched
	 * case-insensitively, otherwise the parent regex is used.
	 *
	 * @param $unused mixed Ignored
	 * @return string
	 */
	public function getRegex( $unused = null ) {
		return $this->site->hasCapitalLinks()
			? $this->getRegexFirstCaseInsensitive()
			: parent::getRegex();
	}
}
diff --git "a/include/class-mw\\TitlePart.php" "b/include/class-mw\\TitlePart.php"
index cb5617e..e3edc39 100644
--- "a/include/class-mw\\TitlePart.php"
+++ "b/include/class-mw\\TitlePart.php"
@@ -1,104 +1,110 @@
.
# MediaWiki
namespace mw;

/**
 * A string part of a wikilink, a namespace, etc.
*/
class TitlePart {

	/**
	 * The normalized string
	 *
	 * @var string
	 */
	private $s;

	/**
	 * Constructor
	 *
	 * @param $s string Raw string, normalized before being stored
	 */
	public function __construct( $s ) {
		$this->set( $s );
	}

	/**
	 * Static constructor
	 *
	 * @param $s string
	 * @return static
	 */
	public static function factory( $s ) {
		return new static( $s );
	}

	/**
	 * Get the normalized string
	 *
	 * @return string
	 */
	public function get() {
		return $this->s;
	}

	/**
	 * Set the string, normalizing it
	 *
	 * @param $s string
	 * @return self
	 */
	public function set( $s ) {
		$this->s = static::normalize( $s );
		return $this;
	}

	/**
	 * Check if the normalized string is empty
	 *
	 * @return bool
	 */
	public function isEmpty() {
		return '' === $this->get();
	}

	/**
	 * Get a regex matching this title part
	 *
	 * @param $delimiter string|false|null See TitlePart::regex()
	 * @return string
	 */
	public function getRegex( $delimiter = null ) {
		return self::regex( $this->get(), $delimiter );
	}

	/**
	 * Normalize a string
	 *
	 * Underscores become spaces and then the result is trimmed.
	 * The conversion comes first, so that also leading and trailing
	 * underscores are stripped ('_Main_page_' becomes 'Main page').
	 *
	 * @param $s string
	 * @return string
	 */
	public static function normalize( $s ) {
		$s = self::underscore2space( $s );
		return trim( $s );
	}

	/**
	 * Convert underscores to spaces
	 *
	 * @param $s string
	 * @return string
	 */
	public static function underscore2space( $s ) {
		return str_replace( '_', ' ', $s );
	}

	/**
	 * Convert spaces to underscores
	 *
	 * @param $s string
	 * @return string
	 */
	public static function space2underscore( $s ) {
		return str_replace( ' ', '_', $s );
	}

	/**
	 * Create a regex matching this title
	 *
	 * Every space is converted into a sequence of one or more
	 * spaces/underscores.
	 *
	 * @param $s         string
	 * @param $delimiter string|false|null Regex delimiter to be escaped.
	 *                   NULL means the default '/'.
	 *                   FALSE means that no delimiter is escaped.
	 * @return string
	 */
	public static function regex( $s, $delimiter = null ) {

		// the whole system is based on the fact that delimiters are escaped
		if( $delimiter === null ) {
			$delimiter = '/';
		} elseif( $delimiter === false ) {
			// an empty delimiter is how preg_quote() is told to escape no delimiter
			$delimiter = '';
		}

		$s = preg_quote( $s, $delimiter );

		// These are all valids
		//  [[Main page]]
		//  [[Main _ page]]
		//  [[_ _ Main_ _ _ page _ _ ]]
		return str_replace( ' ', '[ _]+', $s );
	}

	/**
	 * Simple version to obtain the content of this
	 *
	 * @return string
	 */
	public function __toString() {
		return $this->get();
	}
}
diff --git "a/include/class-mw\\Wikilink.php" "b/include/class-mw\\Wikilink.php"
index 310ff4c..7c0267b 100644
--- "a/include/class-mw\\Wikilink.php"
+++ "b/include/class-mw\\Wikilink.php"
@@ -1,255 +1,259 @@
.
# MediaWiki
namespace mw;

/**
 * A MediaWiki wikilink
 *
 * Something like [[Wikipedia:Contatti|contatti]].
*/
class Wikilink {

	/**
	 * A valid alias value: the alias can be whatever
	 */
	const WHATEVER_ALIAS = null;

	/**
	 * A valid alias value: there is no alias
	 */
	const NO_ALIAS = false;

	/**
	 * MediaWiki title
	 *
	 * When falsy, it means that can be whatever
	 *
	 * @var CompleteTitle|null
	 */
	private $title;

	/**
	 * Displayed alias
	 *
	 * When NULL, it means that can be whatever,
	 * When false, it means that there is no one.
	 *
	 * @var string|false|null
	 */
	private $alias;

	/**
	 * Constructor
	 *
	 * @param $title CompleteTitle
	 * @param $alias string|false|null (NULL: whatever, false: no one)
	 */
	public function __construct( CompleteTitle $title, $alias = null ) {
		$this->setTitle( $title )
		     ->setAlias( $alias );
	}

	/**
	 * Set the title (link)
	 *
	 * @param $title CompleteTitle
	 * @return self
	 */
	public function setTitle( CompleteTitle $title ) {
		$this->title = $title;
		return $this;
	}

	/**
	 * Set the alias
	 *
	 * @param $alias string|null|false (NULL: whatever, false: no one)
	 * @return self
	 */
	public function setAlias( $alias ) {
		$this->alias = $alias;
		return $this;
	}

	/**
	 * Check if a specific, non-empty alias was set
	 *
	 * A strict check is needed because an alias like '0' is falsy
	 * in PHP but it is a perfectly valid piped text.
	 *
	 * @return bool
	 */
	private function hasExplicitAlias() {
		return $this->alias !== self::WHATEVER_ALIAS
		    && $this->alias !== self::NO_ALIAS
		    && (string) $this->alias !== '';
	}

	/**
	 * Get a regex matching this wikilink's title
	 *
	 * If there is no title, match a generic one.
	 *
	 * @param $args array Arguments
	 * 	'wikilink' boolean If false, a category will categorize (default true)
	 * @return string
	 */
	public function getRegexTitle( $args = [] ) {

		// see CompleteTitle#getRegex()

		return $this->title
			? $this->title->getRegex( $args )
			: '[' . self::legalTitleCharset() . ']*';
	}

	/**
	 * Get a regex matching this wikilink's anchor
	 *
	 * The anchor is optional and its text may be empty.
	 *
	 * @param $args array Arguments
	 * 	anchor-group-name (string) group name for the anchor (default none)
	 * @return string
	 */
	public function getRegexAnchor( $args = [] ) {

		// default options, so that this can be safely called without arguments
		$args = array_replace( [
			'anchor-group-name' => null,
		], $args );

		// @TODO: allow CompleteTitle objects to have a specific anchor
		//        and in that case delegate to $this->title->getRegexAnchor( $args )

		// allowed characters in the anchor
		$regex = self::legalTitleCharset();

		// may be empty
		$regex = "[$regex]*";

		// create a group for just the text after the '#' (that may be empty)
		$regex = \regex\Generic::groupNamed( $regex, $args[ 'anchor-group-name' ] );

		// the anchor may be not specified
		return "( *#$regex)?";
	}

	/**
	 * Get a regex matching this wikilink's alias
	 *
	 * If there is no specific alias, match a generic one.
	 *
	 * @return string
	 */
	public function getRegexAlias() {

		// match the alias exactly if present
		// the '/' delimiter is escaped as in TitlePart::regex()
		if( $this->hasExplicitAlias() ) {
			return preg_quote( $this->alias, '/' );
		}

		// match whatever alias otherwise (non-greedy)
		return '.*?';
	}

	/**
	 * Get the wikitext that will point to this wikilink
	 *
	 * @param $args array
	 * 	'wikilink' boolean If false, a category will categorize (default true)
	 * @return string
	 */
	public function getWikitext( $args = [] ) {

		// default arguments
		$args = array_replace( [
			'wikilink' => true,
		], $args );

		$completetitle = $this->title;

		// categories must be prefixed with ':' if you want a wikilink
		$prefix = '';
		if( $completetitle->getNs()->getID() === 14 && $args[ 'wikilink' ] ) {
			$prefix = ':';
		}

		// the alias is the piped text (also when it is just one character)
		$alias = '';
		if( $this->hasExplicitAlias() ) {
			$alias = "|$this->alias";
		}

		// the complete title has the 'Namespace:' part only when the
		// namespace has a name, so the main namespace has no stray ':'
		$target = $completetitle->getCompleteTitle();

		// @TODO: get the anchor from CompleteTitle
		return "[[$prefix$target$alias]]";
	}

	/**
	 * Get a regex matching this wikilink
	 *
	 * @param $args array Arguments to be specified
	 * 	'title-group-name'  string If specified, the title will be captured in a group with this name
	 * 	'alias-group-name'  string If specified, the alias will be captured in a group with this name
	 * 	'anchor-group-name' string If specified, the anchor will be captured in a group with this name
	 * 	'wikilink':         bool   If false, a category will categorize (default true)
	 * @return string
	 */
	public function getRegex( $args = [] ) {

		// default options
		$args = array_replace( [
			'wikilink'          => true,
			'title-group-name'  => null,
			'alias-group-name'  => null,
			'anchor-group-name' => null,
		], $args );

		// regex matching the title
		$title_regex = $this->getRegexTitle( [
			'wikilink' => $args[ 'wikilink' ],
		] );
		$title_regex = \regex\Generic::groupNamed( $title_regex, $args[ 'title-group-name' ] );

		// regex matching the anchor
		$anchor_regex = $this->getRegexAnchor( $args );

		// regex matching the alias (if any)
		$alias_regex = false;
		if( $this->alias !== self::NO_ALIAS ) {
			$alias_regex = \regex\Generic::groupNamed( $this->getRegexAlias(), $args[ 'alias-group-name' ] );

			// the alias always comes after a pipe, or a specific
			// alias could never match something like [[Title|alias]]
			$alias_regex = "[ _]*\|$alias_regex";

			// the alias part, when it's a catch-all, is optional
			if( $this->alias === self::WHATEVER_ALIAS ) {
				// note, do not try to create an atomic group, because atomic groups do not backreference past
				$alias_regex = "($alias_regex)?";
			}
		}

		// complete regex
		$regex  = $title_regex;
		$regex .= $anchor_regex;
		if( $alias_regex ) {
			$regex .= $alias_regex;
		}

		// surround with spaces
		$regex = \regex\Generic::spaceBurger( $regex );

		// surround with brackets
		return "\[\[$regex\]\]";
	}

	/**
	 * Legal characters for a title
	 *
	 * The result is meant to be placed inside a regex character class.
	 *
	 * @see https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars
	 * @return string
	 */
	public static function legalTitleCharset() {
		return ' %!\"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+';
	}

	/**
	 * Legal characters for an alias
	 *
	 * They are the legal title characters plus some others.
	 *
	 * @see https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars
	 * @return string
	 */
	public static function legalAliasCharset() {
		return self::legalTitleCharset() . '#<>\[\]{}\n\t';
	}
}