# User:Melancholie/vernacular.pl

#!/usr/bin/perl
# Dump source: http://download.wikimedia.org/specieswiki/latest/specieswiki-latest-pages-articles.xml.bz2
# Script usage: perl vernacular.pl 500 cs

use strict;
use warnings;

# Scans a Wikispecies XML dump line by line and lists content pages whose
# vernacular-name template (VN / Vernacular / VN2) has no entry for the
# requested language, or that carry no such template at all.
#
# Arguments:
#   $ARGV[0]  maximum number of pages to report (default 500)
#   $ARGV[1]  language code looked up inside the template (default "cs")
#
# Input:  specieswiki-latest-pages-articles.xml in the current directory.
# Output: missingVernacularNames.txt, one "#[[Title]] (... missing)" line
#         per reported page. Redirects and non-content namespaces are skipped.
# Dies with a message on a bad limit, a missing input file or any I/O error.

my $items = defined $ARGV[0] ? $ARGV[0] : 500;
my $lang  = defined $ARGV[1] ? $ARGV[1] : "cs";

# The limit is compared numerically below, so reject anything that is not a
# plain non-negative integer instead of silently never reaching the limit.
$items =~ /\A\d+\z/
    or die "\n ERROR: Item limit must be a non-negative integer!\n"
         . " Usage: perl vernacular.pl 500 cs\n";

my $input  = "specieswiki-latest-pages-articles.xml";
my $output = "missingVernacularNames.txt";

# Stop right away: continuing without input would only leave an empty report.
-f $input or die "\n ERROR: Input file not found!\n";

open my $in,  '<', $input  or die "\n ERROR: Cannot read $input: $!\n";
open my $out, '>', $output or die "\n ERROR: Cannot write $output: $!\n";

# Matches "cs =" at the start of a line or after a pipe. \Q...\E keeps the
# user-supplied language code from being interpreted as regex syntax.
my $lang_re = qr/(?:^|\|) *\Q$lang\E *=/;

my $item  = 0;     # pages reported so far
my $title = '';    # title of the page currently being read

# Per-page state; all of it is reset when the page's </text> is reached.
my $depth    = 0;  # brace nesting level tracked while inside the VN template
my $in_text  = 0;  # inside the <text> element
my $redirect = 0;  # page text contains #REDIRECT
my $in_vn    = 0;  # currently inside a VN template
my $seen_vn  = 0;  # page contains a VN template somewhere
my $has_name = 0;  # VN template has an entry for $lang

while (my $line = <$in>) {
    last if $item >= $items;

    if ($line =~ /<title>([^<]+)<\/title>/) { $title = $1; }

    # Skip every line of pages outside the main content namespace.
    next if $title =~ /^(?:Wikispecies|Image|Category|Template|MediaWiki|Help|Talk|User|Main Page)(?::|$)/
         || $title =~ / talk:/;

    $in_text = 1 if $line =~ /<text xml:space="preserve">/;

    if ($in_text) {
        $redirect = 1 if $line =~ /\# *REDIRECT/i;

        if ($line =~ /\{\{ *(?:msg|template)?:? *(?:VN|Vernacular|VN2) *(?:\||$)/i) {
            $in_vn   = 1;
            $seen_vn = 1;
        }

        if ($in_vn) {
            # Look for the language BEFORE testing for the end of the
            # template, so a template written on a single line (or an entry
            # on the closing line) is still recognised.
            $has_name = 1 if $line =~ $lang_re;

            # Depth moves by at most one step per line in each direction,
            # which is enough to ride over nested templates.
            $depth++ if $line =~ /\{{2,3}/;
            $depth-- if $line =~ /\}{2,3}/;
            $in_vn = 0 if $depth == 0 && $line =~ /\}\}/;
        }
    }

    if ($line =~ /<\/text>/) {
        if (!$has_name && !$redirect) {
            $item++;
            # With a template present only the language is missing;
            # otherwise the whole VN template is.
            my $missing = $seen_vn ? $lang : "VN";
            print {$out} "#[[${title}]] (${missing} missing)\n"
                or die "\n ERROR: Cannot write $output: $!\n";
        }

        # Reset ALL per-page state so nothing leaks into the next page.
        $depth    = 0;
        $in_text  = 0;
        $redirect = 0;
        $in_vn    = 0;
        $seen_vn  = 0;
        $has_name = 0;
    }
}

# Buffered write errors only surface at close time.
close $out or die "\n ERROR: Cannot finish writing $output: $!\n";
close $in;