User:Jberkel/partial decompress
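A short Python script to partially decompress multistream MediaWiki dumps: it seeks to a given byte offset in the dump file and decompresses the single bz2 stream that starts there. See also Wiktionary:Grease_pit/2015/February#multi-stream_bz2_Wiktionary_dumps.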
#!/usr/bin/env python
from bz2 import BZ2Decompressor
import sys

# Byte offset in the dump file at which the bz2 stream to extract begins.
# NOTE(review): presumably taken from the dump's accompanying multistream
# index file -- confirm against the index for this dump date.
offset = 39191316
filename = "enwiktionary-20150224-pages-articles-multistream.xml.bz2"


def decompress_stream(path, start, out, chunk_size=8192):
    """Decompress the single bz2 stream beginning at byte ``start`` of ``path``.

    The file is read in ``chunk_size``-byte pieces starting at ``start`` and
    the decompressed bytes are written to ``out``.  Decompression stops when
    the stream that begins at ``start`` ends, or at end of file, whichever
    comes first; any streams that follow are ignored.

    Args:
        path: Path of the (multistream) bz2 file.
        start: Absolute byte offset of the first byte of the wanted stream.
        out: Writable object accepting bytes (e.g. a file opened in binary
            mode or ``io.BytesIO``).
        chunk_size: Number of compressed bytes to read per iteration.
    """
    decompressor = BZ2Decompressor()
    # Binary mode: the dump is compressed data, never text.
    with open(path, "rb") as f:
        f.seek(start, 0)
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break  # EOF
            try:
                data = decompressor.decompress(chunk)
            except EOFError:
                # Raised when data is fed after the end of the stream was
                # already reached on a previous call.
                break  # current stream finished
            if data:
                # Write raw bytes; no separator is added between chunks so
                # the decompressed output is reproduced exactly.
                out.write(data)


if __name__ == "__main__":
    # Python 3 exposes the binary stdout as sys.stdout.buffer; Python 2's
    # sys.stdout already accepts byte strings.
    decompress_stream(filename, offset, getattr(sys.stdout, "buffer", sys.stdout))