#!/usr/bin/perl
# wiktgrep2 string
#
# scans a wiktionary xml dump file
# looks for articles containing the string arg
# dumps the <title> field, ignores other fields up to the <text> field
# then dumps all lines of the text field. scan resumes
# all lines are dumped preceded by their line number
# and that line's offset in the dump file
#
# uses two pre-built index files to speed up the
# process: a list of all article names in order, and a list of offsets
# to the start of each article in the dumpfile
use strict;
use Path::Class; # not standard!
use IO::Uncompress::Bunzip2 qw(bunzip2 $Bunzip2Error);
use HTML::Entities;
my $config = getconfig();
our($NFH) = $config->{'date'} . '-all.txt';
our($IFH) = $config->{'date'} . '-off.raw';;
my $df = Path::Class::file($config->{'dumppath'}, ('enwiktionary-' . $config->{'date'} . '-pages-articles.xml' . $config->{'ext'}));
my $regexp = shift;
print "using dump file $df\n--------\n";
my $dumph; # file handle or bzip2 object
my $mode; # 0 for text, 1 for bzip2
open NFH or die "no name file";
open IFH or die "no index file";
open(DFH, $df) or die "no dump file";
binmode(STDOUT, ":utf8"); # TODO experimental for windows console
binmode(NFH, ":utf8"); # TODO experimental for windows console
binmode(DFH, ":utf8"); # TODO experimental for windows console
binmode(IFH);
binmode(STDOUT, 'utf8'); # make optional
if (rindex($df, ".bz2") != -1) {
print STDERR "** bzip2 compressed dump **\n";
$mode = 1;
$dumph = new IO::Uncompress::Bunzip2(\*DFH) or die "IO::Uncompress::Bunzip2 failed: $Bunzip2Error\n";;
} else {
print STDERR "** uncompressed dump **\n";
$mode = 0;
$dumph = \*DFH;
}
my $lineno;
my $raw;
my $offset;
while (<NFH>) {
if (/$regexp/) {
$lineno = $. - 1;
print $_, "--------\n";
seek(IFH, $lineno * 4, 0) == 0 && die "can't seek index file";
read(IFH, $raw, 4) || die "can't read index $lineno";
$offset = unpack('I', $raw);
if ($mode == 0) {
seek(DFH, $offset, 0) == 0 && die "can't seek dump file";
} else {
$dumph->seek($offset, 0) == 0 && die "can't seek compressed dump file";
}
my $t = <$dumph>;
$t = decode_entities($t);
$t = substr($t,11,length($t)-20);
while (<$dumph>) {
last if (/<text /);
}
my $l = $_;
my $islast = 0;
while (1) {
if (index($l, ' <text') == 0) {
$l = substr($l, 33);
}
if (rindex($l, '</text>') != -1) {
$l = substr($l, 0, -8);
$islast = 1;
}
$l = decode_entities($l);
print $l;
last if $islast;
$l = <$dumph>;
}
print "\n--------\n";
}
}
exit;
##########################################
sub getconfig {
require Cwd;
require File::HomeDir; # not standard!
require FindBin;
my $configname = '.mwconfig';
my $configpath = undef;
my $dumppath = undef;
my $date = undef;
my $ext = undef;
for ( Cwd::getcwd(), File::HomeDir->my_home, $FindBin::Bin ) {
my $d = Path::Class::file($_, $configname);
if (-d $d) {
$configpath = $d;
last;
}
}
unless ($configpath) {
die "no config file in current dir, my home dir, or the script's dir";
} else {
print "using config file $configpath\n";
my $configdumppath = Path::Class::file($configpath, 'dumppath');
if (-e $configdumppath) {
#print "config contains dump path\n";
unless (open DP, "<$configdumppath") {
print "can't open configdumppath\n";
} else {
#print "opened configdumppath\n";
$dumppath = <DP>;
chomp $dumppath;
#print "dump files are stored in $dumppath\n";
unless (-d $dumppath) {
print "specified dump file directory $dumppath doesn't exist\n";
} else {
my $origdir = Cwd::getcwd();
chdir $dumppath;
# find newest .xml and .xml.bz2 dump files
my @files = glob('enwiktionary-????????-pages-articles.xml');
push @files, glob('enwiktionary-????????-pages-articles.xml.bz2');
if (@files) {
my @sorted = sort { $b->[0] <=> $a->[0] || $a->[1] cmp $b->[1] }
map { /enwiktionary-(\d\d\d\d\d\d\d\d)-pages-articles.xml(\.bz2)?/; [ $1, $2 ] }
@files;
$date = $sorted[0][0];
$ext = $sorted[0][1];
}
chdir $origdir;
}
}
} else {
print "config doesn't contain dump path\n";
}
}
return { 'dumppath' => $dumppath, 'date' => $date, 'ext' => $ext };
}