User:Ruakh/count-L2-headers.pl

#!/usr/bin/perl -w

# Copyright 2012 Ran Ari-Gur.

#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   For the text of the GNU General Public License, see
#   <http://www.gnu.org/licenses/>.


# You can contact the author via
# <http://en.wiktionary.org/wiki/User_talk:Ruakh>.


# This Perl script reads in a MediaWiki XML dump (either still compressed as a
# *.xml.bz2 file, or decompressed as a *.xml file -- the latter being
# significantly faster), examines all the mainspace entries, and prints out a
# wiki-formatted list of all L2 headers, together with the number of times that
# each one occurs. The output is encoded in UTF-8, so should be redirected to a
# file rather than printed to the console.


use warnings;
use strict;

use IO::Uncompress::Bunzip2 qw/$Bunzip2Error/;

if(@ARGV == 0)
  { die "Usage: count-L2-headers.pl DUMP_FILENAME > RESULTS.txt\n"; }

my $dump;
if($ARGV[0] =~ m/\.bz2$/)
{
  $dump = IO::Uncompress::Bunzip2->new($ARGV[0])
    or die "Could not open '$ARGV[0]' as a .bz2 stream: $Bunzip2Error\n";
}
else
{
  open $dump, '<', $ARGV[0]
    or die "Could not open '$ARGV[0]' for reading: $!\n";
  my $lt;
  read $dump, $lt, 1;
  if($lt ne '<')
    { die "'$ARGV[0]' does not begin with '<'.\n"; }
}

my %namespaces;

while(<$dump>)
{
  last if m/<\/namespaces>/;

  if(m/<namespace\s[^>]*>([^<]+)<\/namespace>/)
    { $namespaces{$1} = 1; }
}

my $title;
my $in_text = '';

my %counts;

while(<$dump>)
{
  if($in_text)
  {
    if(s/<\/text>\s*$//)
      { $in_text = ''; }
    if(m/^==(?!=.*===\s*$)\s*(.*\S)\s*==\s*$/)
    {
      my $L2 = $1;
      if($title !~ m/^([^:]+):/ || ! $namespaces{$1})
        { ++$counts{$L2}; }
    }
  }
  else
  {
    if(m/^\s*<title>([^<]+)<\/title>\s*$/)
      { $title = $1; }
    elsif(s/^\s*<text xml:space="preserve">//)
    {
      $in_text = 1;
      redo;
    }
  }
}

print "\xEF\xBB\xBF\n";
foreach my $lang (sort keys %counts)
{
  print "* <tt><nowiki>$lang</nowiki></tt> \xE2\x80\x94 $counts{$lang}\n";
}

__END__