/**
 * Look up the metadata that recognizes a label and build its value
 *
 * Every known metadata is asked, in order, whether it matches the label
 * (via its matchesLabel() method). The first one that does is used to
 * create the value (via its createValue() method); later ones are ignored.
 *
 * @param  array  $metadatas Array of known metadatas
 * @param  string $label     Original label like 'Titolo originale:'
 * @param  string $value     Value related to the matching metadata
 * @return MetadataValue|false Matching metadata value or false if no metadata matched
 */
function find_matching_metadatavalue_from_label( $metadatas, $label, $value ) {

	// pessimistic default: nothing recognized this label
	$created = false;

	foreach( $metadatas as $candidate ) {

		// not the right one: try the next
		if( !$candidate->matchesLabel( $label ) ) {
			continue;
		}

		// first match wins
		$created = $candidate->createValue( $value );
		break;
	}

	return $created;
}
/**
 * Generator of name variants
 *
 * A name made of exactly two parts separated by ", " yields two variants:
 * first the swapped form, then the parts in their original order, both
 * joined by a single space:
 *
 *   "Anderson, Domenico" → "Domenico Anderson", "Anderson Domenico"
 *
 * Any other name is yielded once, unchanged.
 *
 * @param  string $name
 * @return Generator Yields strings
 */
function generator_name_variants( $name ) {

	$pieces = explode( ", ", $name );

	// not in the "A, B" form: the name itself is the only variant
	if( count( $pieces ) !== 2 ) {
		yield $name;
		return;
	}

	list( $head, $tail ) = $pieces;

	// "A, B" → "B A"
	yield $tail . ' ' . $head;

	// "A, B" → "A B"
	yield $head . ' ' . $tail;
}
/**
 * Tell whether at least one of the terms occurs inside the subject
 *
 * The comparison is a plain, case-sensitive substring search.
 * The search stops at the first term that is found.
 *
 * @param  array  $terms   Terms to look for
 * @param  string $subject Text to be inspected
 * @return bool   True if a term is part of the subject
 */
function is_term_found( $terms, $subject ) {

	$matched = false;

	foreach( $terms as $needle ) {

		// this term is not part of the subject: try the next one
		if( strpos( $subject, $needle ) === false ) {
			continue;
		}

		// gotcha!
		$matched = true;
		break;
	}

	return $matched;
}
/**
 * Parse an Italian date to a Commons date
 *
 * Recognized formats (case-insensitive, surrounding whitespace ignored):
 *
 *   "Maggio 2000"              → "2000-05"
 *   "1999-2000"                → "{{Other date|-|1999|2000}}"
 *   "Prima del 1893"           → "{{Other date|<|1893}}"
 *   "Prima del settembre 1871" → "{{Other date|<|1871-09}}"
 *
 * Anything else (including NULL and the empty string) is returned untouched.
 *
 * @param  string|null $date Date as found in the source metadata
 * @return string|null Commons-friendly date, or the original value if not recognized
 */
function italian_date_2_commons( $date ) {

	// no date no party - do not feed NULL to the string functions
	if( $date === null || $date === '' ) {
		return $date;
	}

	$MESI = [
		'gennaio'   => "01",
		'febbraio'  => "02",
		'marzo'     => "03",
		'aprile'    => "04",
		'maggio'    => "05",
		'giugno'    => "06",
		'luglio'    => "07",
		'agosto'    => "08",
		'settembre' => "09",
		'ottobre'   => "10",
		'novembre'  => "11",
		'dicembre'  => "12",
	];

	// normalize once: every pattern below works on this lowercase, trimmed copy
	$date_lower = strtolower( trim( (string) $date ) );

	// Maggio 2000 -> '2000-05'
	if( preg_match( '/^([a-z]+) +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year          = $matches[2];
		$month_numeric = $MESI[ $matches[1] ] ?? null;
		if( $month_numeric ) {
			return "$year-$month_numeric";
		}
	}

	// 1999-2000
	if( preg_match( '/^([0-9]{4})-([0-9]{4})$/', $date_lower, $matches ) ) {
		$start = $matches[1];
		$end   = $matches[2];
		return "{{Other date|-|$start|$end}}";
	}

	// Prima del 1893
	if( preg_match( '/^prima +del +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year = $matches[1];
		return "{{Other date|<|$year}}";
	}

	// Prima del settembre 1871
	// the preposition is matched but not captured, since it is not needed
	if( preg_match( '/^prima +(?:del |di |dell\') *([a-z]+) +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year          = $matches[2];
		$month_numeric = $MESI[ $matches[1] ] ?? null;
		if( $month_numeric ) {
			return "{{Other date|<|$year-$month_numeric}}";
		}
	}

	// unknown format: give back the original value as-is
	return $date;
}

/**
 * Convert an Italian technique to a {{Technique}} Wikimedia Commons template
 *
 * The description is searched (case-insensitively) for well-known keywords.
 * If none is found, the description is returned untouched.
 *
 * @param  string|null $description Technique as found in the source metadata
 * @return string|null {{Technique}} template, or the original value if not recognized
 */
function italian_technique_2_commons_template( $description ) {

	// no description no party - do not feed NULL to the string functions
	if( $description === null || $description === '' ) {
		return $description;
	}

	$description_low = strtolower( (string) $description );

	// strpos() returns false (never -1) when the needle is missing
	if( strpos( $description_low, "procedimento all'albumina" ) !== false ) {
		return "{{Technique|albumen print}}";
	}

	// NOTE(review): "gelatina ai sali d'argento" looks like a gelatin silver print,
	// not an albumen one - confirm the intended {{Technique}} keyword
	if( strpos( $description_low, "gelatina ai sali d'argento" ) !== false ) {
		return "{{Technique|albumen silver print}}";
	}

	return $description;
}
$key\n"; echo "| $value\n"; $first = false; } ?> |} [[Category:Collezione Biblioteca - Iconoteca dell'architettura in Mendrisio, Switzerland]] diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php index b386ddc..1056fc5 100755 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php @@ -1,389 +1,475 @@ #!/usr/bin/php . // autoload framework require __DIR__ . '/../includes/boz-mw/autoload.php'; // require some values require 'bootstrap.php'; // load configuration file or create one cli\ConfigWizard::requireOrCreate( __DIR__ . '/../config.php' ); // https://commons.wikimedia.org/wiki/Commons:Bots/Requests/Valerio_Bozzolan_bot_(6) $COMMONS_CONSENSUS_PAGE = "[[Commons:Bots/Requests/Valerio Bozzolan bot (6)|authorized import from Academy of architecture Library of Mendrisio]]"; // load Wikimedia Commons $commons = \wm\Commons::instance(); use \cli\Log; use \cli\Input; use \cli\Opts; +use \cli\ParamValuedLong; use \cli\ParamFlag; use \cli\ParamFlagLong; // register all CLI parameters $opts = new Opts( [ - new ParamFlagLong( 'porcelain', "Do nothing" ), - new ParamFlag( 'help', 'h', "Show this help and quit" ), + new ParamFlagLong( 'porcelain', "Do nothing" ), + new ParamFlagLong( 'preview', "Show a preview of the saved wikitext" ), + new ParamFlagLong( 'force-upload', "Force a re-upload even if the page exists" ), + new ParamValuedLong( 'start-from', "Start from a specific row (default to 1)" ), + new ParamValuedLong( 'limit', "Process only this number of results" ), + new ParamFlag( 'help', 'h', "Show this help and quit" ), ] ); // arguments $unnamed_opts = Opts::unnamedArguments(); $dir = $unnamed_opts[0] ?? null; $template = $unnamed_opts[1] ?? 
null; // porcelain mode means that nothing have to be saved $PORCELAIN = $opts->getArg( 'porcelain' ); +// get a preview of the current wikitext +$PREVIEW = $opts->getArg( 'preview' ); + +// check if you have to force the upload +$FORCE_UPLOAD = $opts->getArg( 'force-upload' ); + +// start from this row +$START_FROM = $opts->getArg( 'start-from' ); + +// limit to this number of results +$LIMIT = $opts->getArg( 'limit' ); + // show the help $show_help = $opts->getArg( 'help' ); // no dir no party if( !$dir || !$template ) { $show_help = true; } // show an help message if( $show_help ) { echo "Usage:\n {$argv[ 0 ]} [OPTIONS] path/data/ path/template/name.tpl\n\n"; echo "Allowed OPTIONS:\n"; $opts->printParams(); exit( $opts->getArg( 'help' ) ? 0 : 1 ); } // missing creators in Wikimedia Commons $MISSING_COMMONS_CREATOR = []; $file_pattern = $dir . '/' . '*.jpg'; // doi_by_name $doi_by_name = []; // array of duplicate DOIs $duplicate_dois = []; +// duplicate SHA1 of filenames +$duplicate_sha1 = []; + // scan the directory - this is written as-is to do not disturb GNU nano with slash and star. 
asd foreach( glob( $file_pattern ) as $file ) { // no data no party $file_data = @file_get_contents( "$file.json" ); $file_data = json_decode( $file_data ); if( !$file_data ) { echo "skip $file missing data\n"; continue; } // check available metadata $img_id = $file_data->{"ID immagine"}; - $title = $file_data->{"Titolo opera"}; + // DOI by name if( empty( $doi_by_name[ $title ] ) ) { $doi_by_name[ $title ] = []; } else { $duplicate_dois[] = $img_id; } + $doi_by_name[ $title ] [] = $img_id; + - $doi_by_name[ $title ][] = $img_id; + // title by SHA1 + $sha1 = sha1_file( $file ); + if( empty( $duplicate_sha1[ $sha1 ] ) ) { + $duplicate_sha1[ $sha1 ] = []; + } + $duplicate_sha1[ $sha1 ][] = "$title - DOI $img_id"; } // find duplicates $duplicates = false; foreach( $doi_by_name as $title => $dois ) { // a title should be unique by DOI if( count( $dois ) > 1 ) { Log::warn( sprintf( "found duplicate title '%s' DOIs %s", $title, implode( ', ', $dois ) ) ); $duplicates = true; } } -// if duplicates, no party -if( $duplicates ) { - //exit( 2 ); +foreach( $duplicate_sha1 as $sha1 => $titles ) { + + if( count( $titles ) > 1 ) { + + Log::warn( sprintf( + "found duplicate file '%s'", + implode( ', ', $titles ) + ) ); + + $duplicates = true; + } } // login in Commons $commons->login(); // columns to be displayed in the report $REPORT_COLUMNS = [ + 'N', 'FILE_WLINK', 'FILE_THUMB', 'TITLE', 'DESCRIPTION', 'LICENSE', 'DATE', 'AUTHOR', 'CREATOR_COMMONS_LINK', 'SIZE_TEMPLATE', 'MEDIUM', + 'MEDIUM_TEMPLATE', + 'PLACE_CREATION', 'DOI_ID', 'SOURCE', ]; // write report $write_report_csv = fopen( 'write-report.csv', 'w' ); $write_report_wiki = fopen( 'write-report.wiki', 'w' ); // columns to be displayed in the log $log_args = []; foreach( $REPORT_COLUMNS as $k => $v ) { $log_args[] = $v; } // write reports fputcsv( $write_report_csv, $log_args ); fwrite( $write_report_wiki, "{| class=\"wikitable\"\n|-\n! " . implode( "\n! ", $log_args ) . 
"\n" ); +$row = 0; +$processeds = 0; + // scan the directory - this is written as-is to do not disturb GNU nano with slash and star. asd foreach( glob( $file_pattern ) as $file ) { // no data no party $file_data = @file_get_contents( "$file.json" ); $file_data = json_decode( $file_data ); if( !$file_data ) { echo "skip $file missing data\n"; continue; } + $row++; + + // start from this element + if( $START_FROM ) { + if( $row < $START_FROM ) { + // skip + continue; + } else { + // continue normally + $START_FROM = null; + } + } + + // number of processed images + $processeds++; + + // apply limit + if( $LIMIT && $processeds > $LIMIT ) { + Log::info( "reached limit of $LIMIT" ); + exit( 0 ); + } + // check available metadata $img_id = $file_data->{"ID immagine"}; $title = $file_data->{"Titolo opera"}; $title_orig = $file_data->{"Titolo originale"} ?? null; $collection = $file_data->{"Collezione"} ?? null; $license = $file_data->{"Licenza"} ?? null; $type = $file_data->{"Tipologia di risorsa"} ?? "Fotografia"; $size = $file_data->{"Dimensioni"} ?? ''; $material = $file_data->{"Tipo materiale"} ?? null; $author_name = $file_data->{"Nome creatore"} ?? null; $author = $file_data->{"Creatore"} ?? $author_name; $date = $file_data->{"Data"} ?? $file_data->{"Data creazione"} ?? null; $process = $file_data->{"Processo e tecnica"} ?? null; + $place = $file_data->{"Luogo creazione"} ?? null; // build the |medium= parameter // example: "Carta. Fatto cor culo." $medium_parts = []; if( $material ) { $medium_parts[] = $material; } if( $process ) { $medium_parts[] = $process; } $medium = implode( '. 
', $medium_parts ); + // obtain a {{Technique}} template parsing keywords from $medium + $medium_template = italian_technique_2_commons_template( $process ); // check license $license_templates = ''; if( $license === 'https://creativecommons.org/licenses/by-sa/4.0/deed.it' ) { $license_templates .= "{{Cc-by-sa-4.0}}"; } else { throw new Exception( "unknown license $license" ); } // source URL $source_url = null; if( $img_id ) { $source_url = sprintf( INVENTORY_URL_FORMAT, $img_id ); } // drop nonsense authors if( $author === 'Autore non identificato' ) { $author = null; } if( $author === 'ignoto' ) { $author = null; } // creator on Wikimedia Commons $creator_commons = null; $creator_commons_template = null; $creator_commons_link = null; if( $author ) { $creator_commons = search_creator_on_commons( $author ); if( $creator_commons ) { $creator_commons_template = '{{' . $creator_commons . '}}'; $creator_commons_link = "[[$creator_commons]]"; } else { $author_possible_variant = first_name_variant( $author ); $creator_commons_template = $author_possible_variant; $MISSING_COMMONS_CREATOR[ $author_possible_variant ] = $MISSING_COMMONS_CREATOR[ $author_possible_variant ] ?? 
0; $MISSING_COMMONS_CREATOR[ $author_possible_variant ]++; Log::warn( "missing Wikimedia Commons [[Creator:$author_possible_variant]]" ); } } // parse the image size $size_template = parse_size( $size ); // check if the filename exists $filename = "$title.jpg"; $filename_complete = "File:$filename"; - $filename_complete_unique = "File:$title (DOI $img_id).jpg"; + $filename_unique = "$title (DOI $img_id).jpg"; + $filename_unique_complete = "File:$filename_unique"; // if this is a duplicate, make the title unique if( in_array( $img_id, $duplicate_dois ) ) { - $filename_complete = $filename_complete_unique; + $filename = $filename_unique; + $filename_complete = $filename_unique_complete; } // template arguments $template_args = [ + 'N' => $row, 'FILE_THUMB' => "[[$filename_complete|100px]]", 'FILE_WLINK' => "[[:$filename_complete]]", 'TITLE' => $title_orig ? "{{it|$title_orig}}" : '', 'DESCRIPTION' => $title ? "{{it|$title}}" : '', 'LICENSE' => $license, 'LICENSE_TEMPLATES' => $license_templates, - 'DATE' => $date, + 'DATE' => italian_date_2_commons( $date ), 'METADATA' => $file_data, 'DOI_ID' => $img_id, 'SOURCE' => $source_url, 'AUTHOR' => $author, 'CREATOR_COMMONS' => $creator_commons_template, 'CREATOR_COMMONS_LINK' => $creator_commons_link, 'SIZE_TEMPLATE' => $size_template, 'MEDIUM' => $medium, + 'MEDIUM_TEMPLATE' => $medium_template, + 'PLACE_CREATION' => $place, ]; // build the page content $page_content = template_content( $template, $template_args ); + // check if you want to show a preview + if( $PREVIEW ) { + echo "---\n"; + echo $page_content; + echo "---\n"; + } + // columns to be displayed in the log $log_args = []; foreach( $REPORT_COLUMNS as $column ) { $log_args[] = $template_args[ $column ]; } // write reports fputcsv( $write_report_csv, $log_args ); fwrite( $write_report_wiki, "|-\n| " . implode( "\n| ", $log_args ) . 
"\n" ); // check if the Commons page exists $commons_page_id = wiki_page_id( $commons, $filename_complete ); - if( $commons_page_id ) { + if( $commons_page_id && !$FORCE_UPLOAD ) { // page exists Log::info( sprintf( "updating https://commons.wikimedia.org/wiki/%s", rawurlencode( $filename_complete ) ) ); // eventually skip saving if( !$PORCELAIN ) { Log::info( sprintf( "Saving..." ) ); // save // https://www.mediawiki.org/w/api.php?action=help&modules=parse $result = $commons->edit( [ 'title' => $filename_complete, 'text' => $page_content, 'summary' => "Bot: $COMMONS_CONSENSUS_PAGE", 'minor' => true, 'bot' => true, ] ); // eventually wait some time if something was changed if( isset( $result->edit->nochange ) ) { Log::info( "no change" ); } else { Log::info( "saved" ); sleep( 5 ); } } } else { // print a message Log::info( sprintf( "try to upload https://commons.wikimedia.org/wiki/%s (DOI %s)", rawurlencode( $filename_complete ), $img_id ) ); // upload this damn image try { if( !$PORCELAIN ) { + // https://www.mediawiki.org/w/api.php?action=help&modules=upload $response = $commons->upload( [ - 'comment' => "Bot: $COMMONS_CONSENSUS_PAGE", - 'text' => $page_content, - 'filename' => "$title.jpg", + 'comment' => "Bot: $COMMONS_CONSENSUS_PAGE", + 'text' => $page_content, + 'filename' => $filename, + 'ignorewarnings' => $FORCE_UPLOAD, \network\ContentDisposition::createFromNameURLType( 'file', $file, 'image/jpg' ), ] ); if( $response->upload->result === 'Success' ) { var_dump( $response ); echo "Done.\n"; } else { // what the fuuck? 
print_r( $response ); } // put in the log this shit - file_put_contents( 'log.out', "$img_id;$filename\n", FILE_APPEND ); + file_put_contents( 'upload.out', "$img_id;$filename\n", FILE_APPEND ); // wait to do not use the bot flag sleep( 5 ); } } catch( Exception $e ) { printf( "%s: %s", get_class( $e ), $e->getMessage() ); - file_put_contents( 'log.out.err', $e->getMessage(), FILE_APPEND ); + file_put_contents( 'upload.out.err', $e->getMessage(), FILE_APPEND ); } } /* // structured data ID $commons_structured_id = "M{$commons_page_id}"; // now the page exists - fetch the Entity ID $commons_structured = $commons->fetchSingleEntity( $commons_structured_id ); // $commons_structured->hasClaimsInProperty(); + + // source URL + // Property:P854 + + // Inscription + // Property:P1684 */ } // show missing Commons creator if( $MISSING_COMMONS_CREATOR ) { print_r( $MISSING_COMMONS_CREATOR ); } fwrite( $write_report_wiki, "|}\n" ); fclose( $write_report_csv ); fclose( $write_report_wiki );