User:Hippietrail/wiktgrep.pl

#!/usr/bin/perl

# wiktgrep2 string
#
# scans a wiktionary xml dump file
# looks for articles containing the string arg
# dumps the <title> field, ignores other fields up to the <text> field
# then dumps all lines of the text field. scan resumes
# all lines are dumped preceded by their line number
# and that line's offset in the dump file
#
# uses two pre-built index files to speed up the
# process: a list of all article names in order, and a list of offsets
# to the start of each article in the dumpfile

use strict;
use Path::Class;    # not standard!
use IO::Uncompress::Bunzip2 qw(bunzip2 $Bunzip2Error);
use HTML::Entities;

my $config = getconfig();

our($NFH) = $config->{'date'} . '-all.txt';
our($IFH) = $config->{'date'} . '-off.raw';;
my $df = Path::Class::file($config->{'dumppath'}, ('enwiktionary-' . $config->{'date'} . '-pages-articles.xml' . $config->{'ext'}));
my $regexp = shift;

print "using dump file $df\n--------\n";

my $dumph;  # file handle or bzip2 object
my $mode;   # 0 for text, 1 for bzip2

open NFH or die "no name file";
open IFH or die "no index file";
open(DFH, $df) or die "no dump file";

binmode(STDOUT, ":utf8");	# TODO experimental for windows console
binmode(NFH, ":utf8");      # TODO experimental for windows console
binmode(DFH, ":utf8");      # TODO experimental for windows console

binmode(IFH);
binmode(STDOUT, 'utf8');    # make optional

if (rindex($df, ".bz2") != -1) {
    print STDERR "** bzip2 compressed dump **\n";
    $mode = 1;
    $dumph = new IO::Uncompress::Bunzip2(\*DFH) or die "IO::Uncompress::Bunzip2 failed: $Bunzip2Error\n";;
} else {
    print STDERR "** uncompressed dump **\n";
    $mode = 0;
    $dumph = \*DFH;
}

my $lineno;
my $raw;
my $offset;

while (<NFH>) {
    if (/$regexp/) {
        $lineno = $. - 1;
        print $_, "--------\n";
        seek(IFH, $lineno * 4, 0) == 0 && die "can't seek index file";

        read(IFH, $raw, 4) || die "can't read index $lineno";
        $offset = unpack('I', $raw);

        if ($mode == 0) {
            seek(DFH, $offset, 0) == 0 && die "can't seek dump file";
        } else {
            $dumph->seek($offset, 0) == 0 && die "can't seek compressed dump file";
        }

        my $t = <$dumph>;
        $t = decode_entities($t);
        $t = substr($t,11,length($t)-20);
        
		while (<$dumph>) {
			last if (/<text /);
		}

        my $l = $_;
        my $islast = 0;
        while (1) {
            if (index($l, '      <text') == 0) {
                $l = substr($l, 33);
            }
            if (rindex($l, '</text>') != -1) {
                $l = substr($l, 0, -8);
                $islast = 1;
            }
            $l = decode_entities($l);
            print $l;
            last if $islast;
            $l = <$dumph>;
        }

		print "\n--------\n";
    }
}

exit;

##########################################

sub getconfig {
    require Cwd;
    require File::HomeDir;  # not standard!
    require FindBin;

    my $configname = '.mwconfig';
    my $configpath = undef;
    my $dumppath = undef;
    my $date = undef;
    my $ext = undef;

    for ( Cwd::getcwd(), File::HomeDir->my_home, $FindBin::Bin ) {
        my $d = Path::Class::file($_, $configname);
        if (-d $d) {
            $configpath = $d;
            last;
        }
    }

    unless ($configpath) {
        die "no config file in current dir, my home dir, or the script's dir";
    } else {
        print "using config file $configpath\n";

        my $configdumppath = Path::Class::file($configpath, 'dumppath');

        if (-e $configdumppath) {
            #print "config contains dump path\n";
            unless (open DP, "<$configdumppath") {
                print "can't open configdumppath\n";
            } else {
                #print "opened configdumppath\n";
                $dumppath = <DP>;
                chomp $dumppath;
                #print "dump files are stored in $dumppath\n";
                unless (-d $dumppath) {
                    print "specified dump file directory $dumppath doesn't exist\n";
                } else {
                    my $origdir = Cwd::getcwd();
                    chdir $dumppath;

                    # find newest .xml and .xml.bz2 dump files

                    my @files = glob('enwiktionary-????????-pages-articles.xml');
                    push @files, glob('enwiktionary-????????-pages-articles.xml.bz2');

                    if (@files) {
                        my @sorted = sort { $b->[0] <=> $a->[0] || $a->[1] cmp $b->[1] }
                            map { /enwiktionary-(\d\d\d\d\d\d\d\d)-pages-articles.xml(\.bz2)?/; [ $1, $2 ]  }
                                @files;

                        $date = $sorted[0][0];
                        $ext = $sorted[0][1];
                    }

                    chdir $origdir;
                }
            }
        } else {
            print "config doesn't contain dump path\n";
        }
    }
    return { 'dumppath' => $dumppath, 'date' => $date, 'ext' => $ext };
}