diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/README.md b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/README.md index 8ae1602..1b1663c 100644 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/README.md +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/README.md @@ -1,53 +1,80 @@ # Importer of multimedia files from https://iconoteca.arc.usi.ch/ ## Description Welcome to the importer of multimedia files for https://iconoteca.arc.usi.ch/. For more information about the consensus: https://it.wikipedia.org/wiki/Wikipedia:Raduni/Biblioteca_dell%27Accademia_di_Mendrisio_4_ottobre_2020 Example: https://commons.wikimedia.org/w/index.php?title=File:Arnolfo_di_Cambio._Busto_di_Bonifacio_VIII_presso_le_Grotte_vaticane.jpg&action=submit ## Installation From this directory: ``` git clone https://github.com/phpquery/phpquery ``` ## Usage ## -First download locally one of their collections: +First download locally one of their collections. + +Note that the server does not allow more than ~2000 images at a time. So do it in two tranches using some available HTTP parameters. For example: ``` -wget https://iconoteca..../collection-asd.html +wget 'https://iconoteca.arc.usi.ch/it/ricerca?isPostBack=1&id_fondo=212&start=0&step=2000' +wget 'https://iconoteca.arc.usi.ch/it/ricerca?isPostBack=1&id_fondo=212&start=2000&step=2000' ``` Then you can examine that HTML page and bulk-download the available images from it: ``` ./parse-html-and-import.php collection-asd.html ``` Then you can bulk-upload your files by just selecting your directory with the images/metadata and selecting a template: ``` ./upload.php images/ template/collezione-biblioteca.php ``` -Happy hacking! 
+Here all the options of the `upload.php` script: + +``` +Usage: + ./upload.php [OPTIONS] path/data/ path/template/name.tpl + +Allowed OPTIONS: + --porcelain Do nothing + --preview Show a preview of the saved wikitext + --force-upload Force a re-upload even if the page exists + --no-report Don't create a report + --no-update Do not try to update something that already exists + --start-from=VALUE Start from a specific row (default to 1) + --limit=VALUE Process only this number of results + --nick=VALUE Nickname used to prefix indexes + --help|-h Show this help and quit +``` + +You can make a script to process a limited batch: + +``` +./process-corboz.sh +``` + +In short. Happy hacking! ## License Copyright (C) 2020 Valerio Bozzolan This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/bootstrap.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/bootstrap.php index 887b837..a3f4bbf 100644 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/bootstrap.php +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/bootstrap.php @@ -1,96 +1,98 @@ . // require two dummy classes require 'include/functions.php'; require 'include/class-Metadata.php'; require 'include/class-MetadataValue.php'; // base URL to be scraped define( 'BASE_URL', 'https://iconoteca.arc.usi.ch' ); // inventory prefix to be stripped out to read the image ID (note the double slash! 
asd) define( 'INVENTORY_PREFIX_TO_STRIP', BASE_URL . '//thumb.php?inventario=' ); // URL to the single photo from the image ID (DOI) define( 'INVENTORY_URL_FORMAT', BASE_URL . '/it/inventario/%d' ); // URL of the high quality image define( 'HIGH_QUALITY_IMAGE_URL', BASE_URL . '/image-viewer.php?inventario=%d' ); // URL of the high quality image define( 'LOW_QUALITY_IMAGE_URL', BASE_URL . '/image_permission_show.php?inventario=%d' ); // image download name (with image ID) define( 'IMAGE_DOWNLOAD_NAME', 'images/%d.jpg' ); // array of metadatas displayed in the body in the '.metadati' selector // basically they are the labels displayed in the body on every image like this one: // https://iconoteca.arc.usi.ch/it/inventario/51630 $METADATA_BODY = [ new Metadata( 'Luogo rappresentato' ), new Metadata( 'Tipologia di risorsa' ), new Metadata( 'Creatore' ), new Metadata( 'Data' ), new Metadata( 'DOI', function ( $p ) { // the DOI is a link, so just extract the URL // text displayed after the label (manually stripping the label) return $p->find( 'a' )->attr( 'href' ); } ), new Metadata( 'ID immagine' ), new Metadata( 'Licenza', function( $p ) { // the License is a link, so just extract the URL // text displayed after the label (manually stripping the label) return $p->find( 'a' )->attr( 'href' ); } ), ]; // array of metadatas displayed in the footer in the '.metadati_completi' selector // basically they are the labels displayed in the footer on every image like this one: // https://iconoteca.arc.usi.ch/it/inventario/51630 $METADATA_FOOTER = [ new Metadata( 'Titolo opera' ), new Metadata( 'Titolo originale' ), + new Metadata( 'Descrizione testuale' ), new Metadata( 'Iscrizione' ), new Metadata( 'Collezione' ), new Metadata( 'Data creazione' ), new Metadata( 'Luogo creazione' ), new Metadata( 'Nome creatore' ), new Metadata( 'Descrittori Sbt' ), new Metadata( 'Descrittori Getty AAT' ), new Metadata( 'Luogo rappresentato', function( $p ) { // take just the text inside the link 
return $p->find( 'a' )->text(); } ), new Metadata( 'Classificazione' ), new Metadata( 'Tipo materiale' ), new Metadata( 'Designazione specifica del materiale' ), new Metadata( 'Supporto originale' ), new Metadata( 'Materiale del supporto' ), new Metadata( 'Nome oggetto culturale' ), new Metadata( 'Colore' ), new Metadata( 'Polarità' ), new Metadata( 'Tipo supporto' ), new Metadata( 'Processo e tecnica' ), new Metadata( 'Montaggio' ), new Metadata( 'Orientamento e forma' ), new Metadata( 'Dimensioni' ), ]; + diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/download-results.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/download-results.php new file mode 100755 index 0000000..0737d5c --- /dev/null +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/download-results.php @@ -0,0 +1,125 @@ +#!/usr/bin/php +. + +// load common files +require 'bootstrap.php'; + +// process this number of files at time +$BATCH_SIZE = 500; + +// pathname to the download directory +$DOWNLOAD_DIR = "./downloads"; + +// phpQuery1 +// https://github.com/phpquery/phpquery +$PHPQUERY = __DIR__ . '/phpquery/phpQuery/phpQuery.php'; +require $PHPQUERY; + +// no command line no party +if( !$argv ) { + echo "Not in command line?\n"; + exit( 1 ); +} + +// no first argument no party +$COLLECTION_ID = $argv[1] ?? null; +$COLLECTION_NICK = $argv[2] ?? null; +$TEMPLATE_PATH = $argv[3] ?? null; + +if( !$COLLECTION_ID ) { + echo "Error: missing COLLECTION_ID\n"; +} + +if( !$COLLECTION_NICK ) { + echo "Error: missing COLLECTION_NICKNAME\n"; +} + +if( !$COLLECTION_PATH ) { + echo "Warning: missing COLLECTION_PATH\n"; +} + +if( !$COLLECTION_ID || !$COLLECTION_NICK ) { + echo "Usage:\n"; + echo " {$argv[0]} COLLECTION_ID COLLECTION_NICKNAME COLLECTION_PATH\n"; + exit( 2 ); +} + +// number of search results +$search_results = null; + +// HTTP query for the search home +$base_url = 'https://iconoteca.arc.usi.ch/it/ricerca'; +$home_url = "$base_url?" . 
http_build_query( [ + 'isPostBack' => '1', + 'id_fondo' => $COLLECTION_ID, +] ); + +// read the file content +$content = file_get_contents( $home_url ); + +// parse the document +$document = phpQuery::newDocument( $content ); + +// enter in page content +foreach( pq( $document )->find( '.paginazione' ) as $pagination ) { + $pagination_text = $pagination->textContent; + $pagination_text = str_replace( 'Risultati:', '', $pagination_text ); + $search_results = (int) trim( $pagination_text ); + break; +} + +$commands = []; + +for( $position = 0; $position < $search_results; $position = $position + $BATCH_SIZE ) { + + // page with some results + $page_url = "$base_url?" . http_build_query( [ + 'isPostBack' => '1', + 'id_fondo' => $COLLECTION_ID, + 'start' => $position, + 'step' => $BATCH_SIZE, + ] ); + + // page HTML content + echo "Downloading $page_url ...\n"; + $page_content = file_get_contents( $page_url ); + + // filename that will be writte in download + $page_file_name = "collection-$COLLECTION_NICK-$COLLECTION_ID-from-$position-to-$BATCH_SIZE.html"; + + // full path of the above + $page_file_path = "$DOWNLOAD_DIR/$page_file_name"; + + file_put_contents( $page_file_path, $page_content ); + + // show what we will do + $command_parts = [ + './parse-html-and-import.php', + escapeshellarg( $page_file_path ), + ]; + + $command = implode( ' ', $command_parts ); + + + $commands[] = $command; +} + +foreach( $commands as $command ) { + + echo $command . "\n"; + +} diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/include/functions.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/include/functions.php index 52b3cd5..9a97143 100644 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/include/functions.php +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/include/functions.php @@ -1,400 +1,428 @@ . 
/**
 * Find the known metadata matching a label and build its value object.
 *
 * @param array  $metadatas Known metadatas; each one must expose matchesLabel() and createValue()
 * @param string $label     Original label like 'Titolo originale:'
 * @param mixed  $value     Value related to the matching metadata (handed to createValue() untouched)
 * @return MetadataValue|false Result of createValue() on the first match, or false if nothing matches
 */
function find_matching_metadatavalue_from_label( $metadatas, $label, $value ) {

	// the first metadata accepting this label wins
	foreach( $metadatas as $metadata ) {
		if( $metadata->matchesLabel( $label ) ) {
			return $metadata->createValue( $value );
		}
	}

	// no metadata no party
	return false;
}

/**
 * Print a message on standard output, prefixed by the current date and time
 *
 * @param string $message
 */
function message( $message ) {
	printf( "[%s] %s\n", date( 'Y-m-d H:i:s' ), $message );
}

/**
 * Convert an HTML link to a wikitext one
 *
 * URLs starting with a slash are made absolute by prepending BASE_URL.
 *
 * NOTE(review): the pattern below has a single capture group while the
 * callback reads $matches[2]. The anchor markup that was presumably part
 * of the pattern seems to be missing from this copy - confirm against the
 * repository before relying on this function.
 *
 * @param string $txt Text possibly containing HTML links
 * @return string Text with the links in the [URL label] wikitext form
 */
function html_link_2_wikitext( $txt ) {
	return preg_replace_callback( '@(.+?)@', function( $matches ) {

		// eventually make the URL absolute
		$url = $matches[1];
		if( $url[0] === '/' ) {
			$url = BASE_URL . $url;
		}

		return sprintf( '[%s %s]', $url, $matches[2] );
	}, $txt );
}

/**
 * Require a template file, exposing some variables to its scope
 *
 * It will eventually echo something.
 *
 * @param string $template      Pathname of the template file to be required
 * @param array  $template_args Variables extracted in the template scope. Existing
 *                              names ($template, $template_args) are never overwritten
 *                              because of EXTR_SKIP.
 * @return mixed Whatever the required file returns
 */
function template( $template, $template_args = [] ) {
	extract( $template_args, EXTR_SKIP );
	return require $template;
}

/**
 * Get the template output as a string
 *
 * It will echo nothing.
 *
 * @param string $name Pathname of the template file
 * @param array  $args Variables to be passed to the template scope
 * @see template()
 * @return string The template output
 */
function template_content( $name, $args = [] ) {

	ob_start();

	try {
		template( $name, $args );

		// the return value is computed before the "finally" runs
		return ob_get_contents();
	} finally {
		// always close our buffer, even if the template throws,
		// otherwise every later output would be swallowed by it
		ob_end_clean();
	}
}

/**
 * Generator of name variants
 *
 * "Anderson, Domenico" yields "Domenico Anderson" and then "Anderson Domenico".
 * Any name that is not made of exactly two parts separated by ", " is yielded as-is.
 *
 * @param string $name
 * @return Generator Yields strings
 */
function generator_name_variants( $name ) {

	$parts = explode( ", ", $name );

	if( count( $parts ) !== 2 ) {
		yield $name;
		return;
	}

	yield "{$parts[1]} {$parts[0]}";
	yield "{$parts[0]} {$parts[1]}";
}

/**
 * Generate the first name variant
 *
 * @param string $name
 * @return string First value produced by generator_name_variants()
 */
function first_name_variant( $name ) {

	foreach( generator_name_variants( $name ) as $variant ) {
		return $variant;
	}

	return $name;
}

/**
 * Search an Author in Wikidata
 *
 * Every name variant is searched. As soon as the candidates collected so far
 * contain exactly one whose description mentions a well-known term, its ID
 * is returned.
 *
 * @param string $name
 * @return string|null Entity ID, or NULL when no candidate was found at all
 * @throws Exception When candidates were found but none can be picked automatically
 */
function search_author_in_wikidata( $name ) {

	// terms that can be found in a description to classify someone
	$TERMS = [
		'photographer',
		'fotograf',
	];

	$all = [
		'good'       => [],
		'undetected' => [],
	];

	// generate a list of possible query terms
	foreach( generator_name_variants( $name ) as $variant ) {

		// query Wikidata results using the wbsearchentities API
		$candidates = find_wikidata_entity_by_title( $variant );
		foreach( $candidates as $id => $description ) {

			// index in the right namespace, depending on the well-known terms
			$key = is_term_found( $TERMS, $description ) ? 'good' : 'undetected';
			$all[ $key ][ $id ] = $description;
		}

		// if exactly one good candidate is known, quit earlier with its ID. otherwise continue
		if( count( $all[ 'good' ] ) === 1 ) {
			return array_keys( $all[ 'good' ] )[0];
		}
	}

	// check if something was undetected
	if( $all[ 'undetected' ] ) {
		print_r( $all );
		throw new Exception( "undetected. TODO: pick" );
	}

	if( $all[ 'good' ] ) {
		print_r( $all );
		throw new Exception( "detected multiple good possibilities. TODO: pick" );
	}

	// nothing found
	return null;
}

/**
 * Search a Creator page in Wikimedia Commons
 *
 * @param string $name
 * @return string|false Title of the first existing "Creator:..." page, or false
 */
function search_creator_on_commons( $name ) {

	$wiki = \wm\Commons::instance();

	// generate a list of possible query terms
	foreach( generator_name_variants( $name ) as $variant ) {
		$title = "Creator:$variant";

		// the same author is asked again and again: use the cached lookup
		if( wiki_page_id_cached( $wiki, $title ) ) {
			return $title;
		}
	}

	return false;
}

/**
 * Search something in Wikidata
 *
 * @param string $search
 * @return array Associative array of ID and its description
 */
function find_wikidata_entity_by_title( $search ) {

	$founds = [];

	$wikidata = \wm\Wikidata::instance();

	// https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities
	$queries = $wikidata->createQuery( [
		'action'   => 'wbsearchentities',
		'search'   => $search,
		'language' => 'it',
		'type'     => 'item',
	] );

	// loop queries
	foreach( $queries as $query ) {
		foreach( $query->search ?? [] as $result ) {

			// store as ID => description
			// (a result may come without any description: fall back to an empty one)
			$founds[ $result->id ] = $result->description ?? '';
		}
	}

	return $founds;
}

/**
 * Test an array of terms and return true if one is found in the subject
 *
 * The comparison is case-sensitive.
 *
 * @param array  $terms
 * @param string $subject
 * @return bool
 */
function is_term_found( $terms, $subject ) {

	foreach( $terms as $term ) {

		// check if the term is part of the subject
		if( strpos( $subject, $term ) !== false ) {
			// gotcha!
			return true;
		}
	}

	return false;
}

/**
 * Check if a page exists
 *
 * @param object $wiki  Wiki exposing a fetch() method
 * @param string $title Page title
 * @return int|false Page ID, or false if the page does not exist
 * @throws Exception When the response does not contain any page
 */
function wiki_page_id( $wiki, $title ) {

	$result = $wiki->fetch( [
		'action' => 'query',
		'prop'   => 'info',
		'titles' => $title,
	] );

	$pages = $result->query->pages ?? [];

	// only the first page is relevant
	foreach( $pages as $page ) {

		// no page no party (also when the page ID is not there at all)
		if( isset( $page->missing ) || !isset( $page->pageid ) || $page->pageid < 0 ) {
			return false;
		}

		// that's OK
		return $page->pageid;
	}

	// no page no party
	throw new Exception( "what" );
}

/**
 * Check if a page exists, remembering the answer for the whole process
 *
 * The cache key is made by the UID constant of the wiki and the title.
 *
 * @param object $wiki  Wiki exposing a UID constant and a fetch() method
 * @param string $title Page title
 * @return int|false
 * @see wiki_page_id()
 */
function wiki_page_id_cached( $wiki, $title ) {

	static $cache = [];

	$wiki_uid = $wiki::UID;
	$key = "$wiki_uid-$title";

	// note that a cached "false" is considered as set, so it's not asked again
	if( !isset( $cache[ $key ] ) ) {
		$cache[ $key ] = wiki_page_id( $wiki, $title );
	}

	return $cache[ $key ];
}

/**
 * Parse "27x20 cm" and return a {{Size}} template if possible
 *
 * Only integer width and height followed by an alphabetic unit are recognized.
 *
 * @param string $size
 * @return string The {{Size}} template, or the untouched input
 */
function parse_size( $size ) {

	if( !preg_match( '/^ *([0-9]+)x([0-9]+) *([a-zA-Z]+) *$/', $size, $matches ) ) {
		return $size;
	}

	$w    = $matches[ 1 ];
	$h    = $matches[ 2 ];
	$unit = $matches[ 3 ];

	return "{{Size|unit = $unit |width = $w |height = $h}}";
}

/**
 * Parse an Italian date to a Commons date
 *
 * @param string|null $date
 * @return string|null Commons date, or the untouched input when the format is unknown
 */
function italian_date_2_commons( $date ) {

	// the cast avoids passing NULL to string functions
	$date_lower = strtolower( (string) $date );

	$MESI = [
		'gennaio'   => "01",
		'febbraio'  => "02",
		'marzo'     => "03",
		'aprile'    => "04",
		'maggio'    => "05",
		'giugno'    => "06",
		'luglio'    => "07",
		'agosto'    => "08",
		'settembre' => "09",
		'ottobre'   => "10",
		'novembre'  => "11",
		'dicembre'  => "12",
	];

	// Maggio 2000 -> '2000-05'
	if( preg_match( '/^([a-z]+) +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year          = $matches[2];
		$month_numeric = $MESI[ $matches[1] ] ?? null;
		if( $month_numeric ) {
			return "$year-$month_numeric";
		}
	}

	// 1999-2000 (just digits and a dash, so the lowercase copy is equivalent)
	if( preg_match( '/^([0-9]{4})-([0-9]{4})$/', $date_lower, $matches ) ) {
		$start = $matches[1];
		$end   = $matches[2];
		return "{{Other date|-|$start|$end}}";
	}

	// Prima del 1893
	if( preg_match( '/^prima +del +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year = $matches[1];
		return "{{Other date|<|$year}}";
	}

	// Prima del settembre 1871
	if( preg_match( '/^prima +(del |di |dell\') *([a-z]+) +([0-9]{4})$/', $date_lower, $matches ) ) {
		$year          = $matches[3];
		$month_numeric = $MESI[ $matches[2] ] ?? null;
		if( $month_numeric ) {
			return "{{Other date|<|$year-$month_numeric}}";
		}
	}

	return $date;
}

/**
 * Convert an Italian technique to a {{Technique}} Wikimedia Commons template
 *
 * @param string|null $description
 * @return string|null The template, or the untouched input when no keyword is found
 */
function italian_technique_2_commons_template( $description ) {

	$description_low = strtolower( (string) $description );

	// strpos() gives false when nothing is found and never -1,
	// so the result must be compared against false
	if( strpos( $description_low, "procedimento all'albumina" ) !== false ) {
		return "{{Technique|albumen print}}";
	}

	// NOTE(review): a gelatin process is mapped to an "albumen silver print" - confirm the wanted value
	if( strpos( $description_low, "gelatina ai sali d'argento" ) !== false ) {
		return "{{Technique|albumen silver print}}";
	}

	return $description;
}

/**
 * Check if a page ID is very recent
 *
 * @param int $pageid
 * @return bool True when the page ID is at least the hard-coded threshold
 */
function is_page_id_very_recent( $pageid ) {

	$VERY_RECENT_PAGE_ID = 97812869;

	return $pageid >= $VERY_RECENT_PAGE_ID;
}

// ---------------------------------------------------------------------------
// End of include/functions.php. The patch continues with the next file:
//   diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/parse-html-and-import.php
//              b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/parse-html-and-import.php
//   index e881cf4..06b06f4 100755
//   @@ -1,219 +1,222 @@
//   #!/usr/bin/php
// ---------------------------------------------------------------------------
// load common files require 'bootstrap.php'; // metadata by selector $METADATA_BY_SELECTOR = [ '.metadati' => $METADATA_BODY, '.metadati_completi' => $METADATA_FOOTER, ]; +// reports base directory +$REPORTS_BASE_PATH = "./reports"; + // phpQuery1 // https://github.com/phpquery/phpquery $PHPQUERY = __DIR__ . '/phpquery/phpQuery/phpQuery.php'; // no phpQuery no party if( !file_exists( $PHPQUERY ) ) { echo "Please read the README\n"; exit( 5 ); } // load phpQuery require $PHPQUERY; // no command line no party if( !$argv ) { echo "Not in command line?\n"; exit( 1 ); } // no first argument no party $page = $argv[1] ?? null; if( !$page ) { echo "Usage:\n {$argv[0]} FILE.html\n"; exit( 2 ); } // no file no party if( !file_exists( $page ) ) { echo "Unexisting file $page\n"; exit( 3 ); } // read the file content $content = file_get_contents( $page ); // no content no party if( !$content ) { echo "No content no party\n"; exit( 4 ); } // parse the document $document = phpQuery::newDocument( $content ); // enter in page content $content = pq( $document )->find( '.page-content' ); // traverse the DOM tree foreach( $content->find( '.row' ) as $row ) { foreach( pq( $row )->find( '.col-md-4' ) as $col ) { // image element $img = pq( $col )->find( 'img' ); // image relative path in the URL $img_path = $img->attr( 'src' ); // no URL no party (wrong elements) if( !$img_path ) { continue; } // absolute image URL $img_url = BASE_URL . '/' . $img_path; // image identifier $img_id = str_replace( INVENTORY_PREFIX_TO_STRIP, '', $img_url ); // it's an integer $img_id = (int) $img_id; // image permalink $img_page_url = sprintf( INVENTORY_URL_FORMAT, $img_id ); // image permalink HTMl content message( "Sucking $img_page_url..." 
); $img_page_content = file_get_contents( $img_page_url ); if( !$img_page_content ) { message( "Skip failed download $img_page_url" ); continue; } // parse image permalink page $img_page = pq( phpQuery::newDocument( $img_page_content ) ); // image data read $img_metadata_values = []; // loop all the possible metadatas finding them from the right selector foreach( $METADATA_BY_SELECTOR as $metadata_selector => $possible_metadatas ) { // parse image body metadatas section foreach( $img_page->find( $metadata_selector ) as $img_metadata ) { // traverse all the paragraphs containing metadatas and try to parse foreach( pq( $img_metadata )->find( 'p' ) as $img_metadata_p_raw ) { // paragraph element $img_metadata_p = pq( $img_metadata_p_raw ); // label // it contains 'Titolo originale:' $img_metadata_p_label = $img_metadata_p->find( 'label' ); // label text // e.g. 'Titolo originale:' $img_metadata_p_label_txt = $img_metadata_p_label->text(); // metadata matching this label $img_metadata_value = find_matching_metadatavalue_from_label( $possible_metadatas, $img_metadata_p_label_txt, $img_metadata_p ); // gotcha? 
if( $img_metadata_value ) { $img_metadata_values[] = $img_metadata_value; } else { message( "Unknown metadata '$img_metadata_p_label_txt' not found in $metadata_selector" ); } } } } // main image $img_main = $img_page->find( '.zoomviewer img' ); // hight quality image URL $img_hq_url = sprintf( HIGH_QUALITY_IMAGE_URL, $img_id ); // low quality image URL $img_lq_url = sprintf( LOW_QUALITY_IMAGE_URL, $img_id ); // image pathname $img_path = sprintf( IMAGE_DOWNLOAD_NAME, $img_id ); // build a metadata file $img_path_json = "$img_path.json"; $img_data_json = []; foreach( $img_metadata_values as $img_metadata_value ) { message( " $key: $value" ); list( $key, $value ) = $img_metadata_value->getData(); $img_data_json[ $key ] = $value; } // no json write no party if( !file_put_contents( $img_path_json, json_encode( $img_data_json, JSON_PRETTY_PRINT ) ) ) { message( "cannot write $img_path_json" ); } foreach( [ $img_hq_url, $img_lq_url ] as $img_url ) { // eventually download the image and save if( !file_exists( $img_path ) ) { message( "Fetching $img_url in $img_path..." 
); // download the image $img_bin = file_get_contents( $img_url ); // sometime this is not an image but is a shitty text // «ERRORE: il livello d'accesso impostato al file non consente di scaricare questa immagine» ASD if( strlen( $img_bin ) > 1000 ) { // save the HQ image or write an error if( !file_put_contents( $img_path, $img_bin ) ) { message( "cannot write $img_path" ); } } else { // WHAAT THE FUUUUUCK IS THIS SHIT message( "invalid image" ); } } } // all right message( "completed $img_id" ); } } diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/process-corboz.sh b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/process-corboz.sh new file mode 100755 index 0000000..487e77e --- /dev/null +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/process-corboz.sh @@ -0,0 +1,8 @@ +ARGS="$@" +./upload.php $ARGS --start-from=1 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=600 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=1200 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=1800 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=2400 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=3000 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php +./upload.php $ARGS --start-from=3600 --limit=600 --nick=corboz ./images-corboz/ ./template/collezione-corboz.php diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php index 7cc629c..4ab1bc2 100644 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php +++ 
b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php @@ -1,79 +1,80 @@ . // this is a template to build a generic Commons file description // this is not used, same as description ?> =={{int:filedesc}}== {{Art photo |photographer = |title = |description = |date = |source = |medium = |dimensions = |accession number = |place of creation = |institution = {{Institution:Iconoteca dell'Accademia di architettura di Mendrisio}} -|department = [https://iconoteca.arc.usi.ch/it/collezione/193 Collezione Biblioteca] +|department = + |inscriptions = { 'Iscrizione' } ) ) { // sometime they put "asasdd" in quotes, so strip them $METADATA->{ 'Iscrizione' } = trim( $METADATA->{ 'Iscrizione' }, '"' ); printf( "{{Inscription|1 = %s}}", $METADATA->{'Iscrizione'} ); } ?> |permission = |other versions = }} == {{int:metadata}} == {| class="wikitable" $value ) { // line row if( !$first ) { echo "|-\n"; } echo "! $key\n"; echo "| $value\n"; $first = false; } ?> |} [[Category:Collezione Biblioteca - Iconoteca dell'architettura in Mendrisio, Switzerland]] diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-corboz.php similarity index 91% copy from 2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php copy to 2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-corboz.php index 7cc629c..fa68eb4 100644 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-biblioteca.php +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/template/collezione-corboz.php @@ -1,79 +1,80 @@ . 
// this is a template to build a generic Commons file description // this is not used, same as description ?> =={{int:filedesc}}== {{Art photo |photographer = |title = |description = |date = |source = |medium = |dimensions = |accession number = |place of creation = |institution = {{Institution:Iconoteca dell'Accademia di architettura di Mendrisio}} -|department = [https://iconoteca.arc.usi.ch/it/collezione/193 Collezione Biblioteca] +|department = + |inscriptions = { 'Iscrizione' } ) ) { // sometime they put "asasdd" in quotes, so strip them $METADATA->{ 'Iscrizione' } = trim( $METADATA->{ 'Iscrizione' }, '"' ); printf( "{{Inscription|1 = %s}}", $METADATA->{'Iscrizione'} ); } ?> |permission = |other versions = }} == {{int:metadata}} == {| class="wikitable" $value ) { // line row if( !$first ) { echo "|-\n"; } echo "! $key\n"; echo "| $value\n"; $first = false; } ?> |} -[[Category:Collezione Biblioteca - Iconoteca dell'architettura in Mendrisio, Switzerland]] +[[Category:Collezione Corboz - Iconoteca dell'architettura in Mendrisio, Switzerland]] diff --git a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php index 1056fc5..c9da8f8 100755 --- a/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php +++ b/2020-10-04-mendrisio-sucker-iconoteca.arc.usi.ch/upload.php @@ -1,475 +1,548 @@ #!/usr/bin/php . // autoload framework require __DIR__ . '/../includes/boz-mw/autoload.php'; // require some values require 'bootstrap.php'; // load configuration file or create one cli\ConfigWizard::requireOrCreate( __DIR__ . 
'/../config.php' ); // https://commons.wikimedia.org/wiki/Commons:Bots/Requests/Valerio_Bozzolan_bot_(6) $COMMONS_CONSENSUS_PAGE = "[[Commons:Bots/Requests/Valerio Bozzolan bot (6)|authorized import from Academy of architecture Library of Mendrisio]]"; // load Wikimedia Commons $commons = \wm\Commons::instance(); use \cli\Log; use \cli\Input; use \cli\Opts; use \cli\ParamValuedLong; use \cli\ParamFlag; use \cli\ParamFlagLong; // register all CLI parameters $opts = new Opts( [ new ParamFlagLong( 'porcelain', "Do nothing" ), new ParamFlagLong( 'preview', "Show a preview of the saved wikitext" ), new ParamFlagLong( 'force-upload', "Force a re-upload even if the page exists" ), + new ParamFlagLong( 'no-report', "Don't create a report" ), + new ParamFlagLong( 'no-update', "Do not try to update something that already exists" ), new ParamValuedLong( 'start-from', "Start from a specific row (default to 1)" ), new ParamValuedLong( 'limit', "Process only this number of results" ), + new ParamValuedLong( 'nick', "Nickname used to prefix indexes" ), new ParamFlag( 'help', 'h', "Show this help and quit" ), ] ); // arguments $unnamed_opts = Opts::unnamedArguments(); $dir = $unnamed_opts[0] ?? null; $template = $unnamed_opts[1] ?? null; // porcelain mode means that nothing have to be saved $PORCELAIN = $opts->getArg( 'porcelain' ); +// option to do not create a report +$WRITE_REPORT = ! 
$opts->getArg( 'no-report' ); + +// option to do not update something already existing +$NO_UPDATE = $opts->getArg( 'no-update' ); + // get a preview of the current wikitext $PREVIEW = $opts->getArg( 'preview' ); // check if you have to force the upload $FORCE_UPLOAD = $opts->getArg( 'force-upload' ); // start from this row $START_FROM = $opts->getArg( 'start-from' ); +// nickname used to prefix indexes +$NICK = $opts->getArg( 'nick', 'mendrisio' ); + // limit to this number of results $LIMIT = $opts->getArg( 'limit' ); +// suffix for the reports +$REPORT_SUFFIX = ''; +if( $START_FROM ) { + $REPORT_SUFFIX .= "-from-$START_FROM"; +} +if( $LIMIT ) { + $REPORT_SUFFIX .= "-to-$LIMIT"; +} + // show the help $show_help = $opts->getArg( 'help' ); // no dir no party if( !$dir || !$template ) { $show_help = true; } // show an help message if( $show_help ) { echo "Usage:\n {$argv[ 0 ]} [OPTIONS] path/data/ path/template/name.tpl\n\n"; echo "Allowed OPTIONS:\n"; $opts->printParams(); exit( $opts->getArg( 'help' ) ? 0 : 1 ); } // missing creators in Wikimedia Commons $MISSING_COMMONS_CREATOR = []; $file_pattern = $dir . '/' . '*.jpg'; // doi_by_name $doi_by_name = []; // array of duplicate DOIs $duplicate_dois = []; // duplicate SHA1 of filenames $duplicate_sha1 = []; // scan the directory - this is written as-is to do not disturb GNU nano with slash and star. 
asd foreach( glob( $file_pattern ) as $file ) { // no data no party $file_data = @file_get_contents( "$file.json" ); $file_data = json_decode( $file_data ); if( !$file_data ) { echo "skip $file missing data\n"; continue; } // check available metadata $img_id = $file_data->{"ID immagine"}; $title = $file_data->{"Titolo opera"}; - // DOI by name if( empty( $doi_by_name[ $title ] ) ) { $doi_by_name[ $title ] = []; } else { $duplicate_dois[] = $img_id; } $doi_by_name[ $title ] [] = $img_id; // title by SHA1 $sha1 = sha1_file( $file ); if( empty( $duplicate_sha1[ $sha1 ] ) ) { $duplicate_sha1[ $sha1 ] = []; } $duplicate_sha1[ $sha1 ][] = "$title - DOI $img_id"; } // find duplicates $duplicates = false; foreach( $doi_by_name as $title => $dois ) { // a title should be unique by DOI if( count( $dois ) > 1 ) { Log::warn( sprintf( "found duplicate title '%s' DOIs %s", $title, implode( ', ', $dois ) ) ); $duplicates = true; } } foreach( $duplicate_sha1 as $sha1 => $titles ) { if( count( $titles ) > 1 ) { Log::warn( sprintf( "found duplicate file '%s'", implode( ', ', $titles ) ) ); $duplicates = true; } } -// login in Commons -$commons->login(); - // columns to be displayed in the report $REPORT_COLUMNS = [ 'N', 'FILE_WLINK', 'FILE_THUMB', 'TITLE', 'DESCRIPTION', 'LICENSE', 'DATE', 'AUTHOR', - 'CREATOR_COMMONS_LINK', - 'SIZE_TEMPLATE', +// 'CREATOR_COMMONS_LINK', + 'SIZE', +// 'SIZE_TEMPLATE', 'MEDIUM', - 'MEDIUM_TEMPLATE', 'PLACE_CREATION', 'DOI_ID', 'SOURCE', ]; -// write report -$write_report_csv = fopen( 'write-report.csv', 'w' ); -$write_report_wiki = fopen( 'write-report.wiki', 'w' ); +// login in Commons +$commons->login(); // columns to be displayed in the log $log_args = []; foreach( $REPORT_COLUMNS as $k => $v ) { $log_args[] = $v; } -// write reports -fputcsv( $write_report_csv, $log_args ); -fwrite( $write_report_wiki, "{| class=\"wikitable\"\n|-\n! " . implode( "\n! ", $log_args ) . 
"\n" ); +// write report +if( $WRITE_REPORT ) { + $write_report_csv = fopen( "$NICK-write-report{$REPORT_SUFFIX}.csv", 'w' ); + $write_report_wiki = fopen( "$NICK-write-report{$REPORT_SUFFIX}.wiki", 'w' ); + + // write headings + fputcsv( $write_report_csv, $log_args ); + fwrite( $write_report_wiki, "{| class=\"wikitable\"\n|-\n! " . implode( "\n! ", $log_args ) . "\n" ); +} $row = 0; $processeds = 0; // scan the directory - this is written as-is to do not disturb GNU nano with slash and star. asd foreach( glob( $file_pattern ) as $file ) { // no data no party $file_data = @file_get_contents( "$file.json" ); $file_data = json_decode( $file_data ); if( !$file_data ) { echo "skip $file missing data\n"; continue; } $row++; // start from this element if( $START_FROM ) { if( $row < $START_FROM ) { // skip continue; } else { // continue normally $START_FROM = null; } } // number of processed images $processeds++; // apply limit if( $LIMIT && $processeds > $LIMIT ) { Log::info( "reached limit of $LIMIT" ); - exit( 0 ); + break; } + Log::info( "$processeds [$row/$LIMIT]" ); + // check available metadata $img_id = $file_data->{"ID immagine"}; $title = $file_data->{"Titolo opera"}; $title_orig = $file_data->{"Titolo originale"} ?? null; $collection = $file_data->{"Collezione"} ?? null; $license = $file_data->{"Licenza"} ?? null; $type = $file_data->{"Tipologia di risorsa"} ?? "Fotografia"; $size = $file_data->{"Dimensioni"} ?? ''; $material = $file_data->{"Tipo materiale"} ?? null; $author_name = $file_data->{"Nome creatore"} ?? null; $author = $file_data->{"Creatore"} ?? $author_name; $date = $file_data->{"Data"} ?? $file_data->{"Data creazione"} ?? null; $process = $file_data->{"Processo e tecnica"} ?? null; $place = $file_data->{"Luogo creazione"} ?? null; // build the |medium= parameter // example: "Carta. Fatto cor culo." $medium_parts = []; if( $material ) { $medium_parts[] = $material; } if( $process ) { $medium_parts[] = $process; } $medium = implode( '. 
', $medium_parts ); // obtain a {{Technique}} template parsing keywords from $medium $medium_template = italian_technique_2_commons_template( $process ); // check license $license_templates = ''; if( $license === 'https://creativecommons.org/licenses/by-sa/4.0/deed.it' ) { $license_templates .= "{{Cc-by-sa-4.0}}"; } else { throw new Exception( "unknown license $license" ); } // source URL $source_url = null; if( $img_id ) { $source_url = sprintf( INVENTORY_URL_FORMAT, $img_id ); } // drop nonsense authors if( $author === 'Autore non identificato' ) { $author = null; } if( $author === 'ignoto' ) { $author = null; } // creator on Wikimedia Commons $creator_commons = null; $creator_commons_template = null; $creator_commons_link = null; if( $author ) { $creator_commons = search_creator_on_commons( $author ); if( $creator_commons ) { $creator_commons_template = '{{' . $creator_commons . '}}'; $creator_commons_link = "[[$creator_commons]]"; } else { $author_possible_variant = first_name_variant( $author ); $creator_commons_template = $author_possible_variant; $MISSING_COMMONS_CREATOR[ $author_possible_variant ] = $MISSING_COMMONS_CREATOR[ $author_possible_variant ] ?? 
0; $MISSING_COMMONS_CREATOR[ $author_possible_variant ]++; Log::warn( "missing Wikimedia Commons [[Creator:$author_possible_variant]]" ); } } // parse the image size $size_template = parse_size( $size ); + // clean the title from unsupported chars + $title_clean = $title; + $title_clean = str_replace( '&eamp;', 'e', $title_clean ); + $title_clean = str_replace( '&', 'e', $title_clean ); + // check if the filename exists - $filename = "$title.jpg"; + $filename = "$title_clean.jpg"; + + // you cannot insert an '&' in the title, replace with 'e' + $filename_complete = "File:$filename"; - $filename_unique = "$title (DOI $img_id).jpg"; + $filename_unique = "$title_clean (DOI $img_id).jpg"; $filename_unique_complete = "File:$filename_unique"; + // check if the Commons page exists + $commons_page_id = wiki_page_id( $commons, $filename_complete ); + if( $commons_page_id ) { + Log::info( " exists" ); + } else { + Log::info( " does not exists" ); + } + + // check if the page exists and is not recent + $exists_and_is_not_recent = false; + + // eventually check if the page is very recent + if( $commons_page_id ) { + if( is_page_id_very_recent( $commons_page_id ) ) { + + Log::info( " is recent (page ID $commons_page_id)" ); + + // assume that this is what we want + } else { + Log::info( " is old (page ID $commons_page_id)" ); + + // the page already exist but it's not recent (so it's not our stuff) + $exists_and_is_not_recent = true; + + // I've just said that this page ID is not relevant + $commons_page_id = false; + } + } + // if this is a duplicate, make the title unique - if( in_array( $img_id, $duplicate_dois ) ) { + if( in_array( $img_id, $duplicate_dois ) || $exists_and_is_not_recent ) { $filename = $filename_unique; $filename_complete = $filename_unique_complete; + + // search again the page ID + $commons_page_id = wiki_page_id( $commons, $filename_complete ); + } + + // print something in this case + if( $PORCELAIN ) { + Log::info( " processing [[$filename_complete]]" 
); } // template arguments $template_args = [ 'N' => $row, 'FILE_THUMB' => "[[$filename_complete|100px]]", 'FILE_WLINK' => "[[:$filename_complete]]", - 'TITLE' => $title_orig ? "{{it|$title_orig}}" : '', - 'DESCRIPTION' => $title ? "{{it|$title}}" : '', + 'TITLE' => $title_orig ?? '', + 'DESCRIPTION' => $title ?? '', 'LICENSE' => $license, 'LICENSE_TEMPLATES' => $license_templates, 'DATE' => italian_date_2_commons( $date ), 'METADATA' => $file_data, 'DOI_ID' => $img_id, 'SOURCE' => $source_url, 'AUTHOR' => $author, 'CREATOR_COMMONS' => $creator_commons_template, 'CREATOR_COMMONS_LINK' => $creator_commons_link, + 'SIZE' => $size, 'SIZE_TEMPLATE' => $size_template, 'MEDIUM' => $medium, 'MEDIUM_TEMPLATE' => $medium_template, + 'COLLECTION' => $collection, 'PLACE_CREATION' => $place, ]; // build the page content $page_content = template_content( $template, $template_args ); // check if you want to show a preview if( $PREVIEW ) { echo "---\n"; echo $page_content; echo "---\n"; } // columns to be displayed in the log $log_args = []; foreach( $REPORT_COLUMNS as $column ) { $log_args[] = $template_args[ $column ]; } // write reports - fputcsv( $write_report_csv, $log_args ); - fwrite( $write_report_wiki, "|-\n| " . implode( "\n| ", $log_args ) . "\n" ); + if( $WRITE_REPORT ) { + fputcsv( $write_report_csv, $log_args ); + fwrite( $write_report_wiki, "|-\n| " . implode( "\n| ", $log_args ) . "\n" ); + } - // check if the Commons page exists - $commons_page_id = wiki_page_id( $commons, $filename_complete ); - if( $commons_page_id && !$FORCE_UPLOAD ) { - - // page exists - Log::info( sprintf( - "updating https://commons.wikimedia.org/wiki/%s", - rawurlencode( $filename_complete ) - ) ); + // check if we have to update the existing page + if( !$FORCE_UPLOAD && $commons_page_id ) { // eventually skip saving - if( !$PORCELAIN ) { + if( !$PORCELAIN && !$NO_UPDATE ) { - Log::info( sprintf( "Saving..." 
) ); + // page exists + Log::info( sprintf( + "updating https://commons.wikimedia.org/wiki/%s", + rawurlencode( $filename_complete ) + ) ); // save // https://www.mediawiki.org/w/api.php?action=help&modules=parse $result = $commons->edit( [ 'title' => $filename_complete, 'text' => $page_content, 'summary' => "Bot: $COMMONS_CONSENSUS_PAGE", 'minor' => true, 'bot' => true, ] ); // eventually wait some time if something was changed if( isset( $result->edit->nochange ) ) { Log::info( "no change" ); } else { Log::info( "saved" ); sleep( 5 ); } } } else { - // print a message - Log::info( sprintf( - "try to upload https://commons.wikimedia.org/wiki/%s (DOI %s)", - rawurlencode( $filename_complete ), - $img_id - ) ); - // upload this damn image try { if( !$PORCELAIN ) { + // print a message + Log::info( sprintf( + "try to upload https://commons.wikimedia.org/wiki/%s (DOI %s)", + rawurlencode( $filename_complete ), + $img_id + ) ); + // https://www.mediawiki.org/w/api.php?action=help&modules=upload $response = $commons->upload( [ 'comment' => "Bot: $COMMONS_CONSENSUS_PAGE", 'text' => $page_content, 'filename' => $filename, 'ignorewarnings' => $FORCE_UPLOAD, \network\ContentDisposition::createFromNameURLType( 'file', $file, 'image/jpg' ), ] ); if( $response->upload->result === 'Success' ) { var_dump( $response ); echo "Done.\n"; } else { // what the fuuck? 
print_r( $response ); } // put in the log this shit - file_put_contents( 'upload.out', "$img_id;$filename\n", FILE_APPEND ); + file_put_contents( "$NICK-upload.out", "$img_id;$filename\n", FILE_APPEND ); // wait to do not use the bot flag sleep( 5 ); } } catch( Exception $e ) { printf( "%s: %s", get_class( $e ), $e->getMessage() ); - file_put_contents( 'upload.out.err', $e->getMessage(), FILE_APPEND ); + file_put_contents( "$NICK-upload.out.err", $e->getMessage(), FILE_APPEND ); } } /* // structured data ID $commons_structured_id = "M{$commons_page_id}"; // now the page exists - fetch the Entity ID $commons_structured = $commons->fetchSingleEntity( $commons_structured_id ); // $commons_structured->hasClaimsInProperty(); // source URL // Property:P854 // Inscription // Property:P1684 */ } // show missing Commons creator if( $MISSING_COMMONS_CREATOR ) { print_r( $MISSING_COMMONS_CREATOR ); } -fwrite( $write_report_wiki, "|}\n" ); +// close reports +if( $WRITE_REPORT ) { + fwrite( $write_report_wiki, "|}\n" ); -fclose( $write_report_csv ); -fclose( $write_report_wiki ); + fclose( $write_report_csv ); + fclose( $write_report_wiki ); +}