User:Flubot/getcat
Tool name: getcat.sh
Original script by Ariel Glenn
Description:
This tool downloads the titles of all entries in a given category and stores them in "cat_tmp/titles.<date>.txt".
Parameters:
- the name of the category, for example:
./getcat.sh "French verb forms"
You need:
- a Unix/Linux environment with the date, sed, awk, cat, and grep commands
- the curl command
→ Back to Flubot |
getcat.sh
#!/bin/bash
#
# getcat.sh - download the titles of all pages in an English Wiktionary
# category.
#
# Usage:
#   ./getcat.sh "French verb forms"
#
# Output (under ./cat_tmp):
#   titles.<date>.xml.temp  the most recently fetched API response page
#   titles.<date>.xml       all API response pages, one XML tag per line
#   titles.<date>.txt       one "[[title]]" wikilink per category member
#
# Requires: curl, date, sed, awk. The sed call relies on "\n" in the
# replacement text producing a newline, as GNU sed does.

set -u

# The API is addressed over HTTPS: plain HTTP requests are answered with a
# redirect, which curl does not follow without -L, so "-f" would report
# success while no data is received.
readonly API_URL="https://en.wiktionary.org/w/api.php"
readonly PAGE_SIZE=500       # titles requested per API call (cmlimit)
readonly REQUEST_DELAY=6     # seconds to wait between API calls
readonly TMP_DIR="./cat_tmp"

# Print usage information and exit with status 1.
usage() {
  echo "Usage: $0 category"
  echo "where category is the name of the category for which to retrieve titles"
  echo
  echo "For example:"
  echo "$0 'French verbs'"
  echo
  exit 1
}

# Print an error message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Fetch one page of category members and write it, split into one XML tag
# per line, to a file.
# Arguments:
#   $1 - full category title (e.g. "Category:French_verbs")
#   $2 - cmcontinue token from the previous page, or "" for the first page
#   $3 - output file
# Returns:
#   0 on success; otherwise the non-zero exit status of curl, or of sed if
#   curl succeeded.
#######################################
fetch_page() {
  local category=$1
  local continue_from=$2
  local out_file=$3
  local -a status
  # Every value goes through --data-urlencode so that characters such as
  # "|", "&" or non-ASCII letters in the title or continuation token cannot
  # corrupt the query string.
  local -a params=(
    --data-urlencode "action=query"
    --data-urlencode "list=categorymembers"
    --data-urlencode "cmtitle=$category"
    --data-urlencode "cmprop=title"
    --data-urlencode "cmlimit=$PAGE_SIZE"
    --data-urlencode "format=xml"
  )
  if [[ -n "$continue_from" ]]; then
    params+=(--data-urlencode "cmcontinue=$continue_from")
  fi

  curl --retry 10 -f -G "${params[@]}" "$API_URL" \
    | sed -e 's/>/>\n/g;' > "$out_file"
  # Capture both stages: plain $? would only reflect sed and hide a curl
  # failure.
  status=("${PIPESTATUS[@]}")

  if (( status[0] != 0 )); then
    return "${status[0]}"
  fi
  return "${status[1]}"
}

main() {
  if [[ -z "${1:-}" ]]; then
    usage
  fi

  # Page titles use underscores in place of spaces.
  local category="Category:${1// /_}"

  # %e pads single-digit days with a space; drop it so the file names never
  # contain whitespace.
  local today
  today=$(date +"%B-%e-%Y") || die "unable to determine the current date"
  today=${today// /}

  mkdir -p -- "$TMP_DIR" || die "unable to create directory $TMP_DIR"
  local titles="$TMP_DIR/titles.$today"
  local page_file="$titles.xml.temp"
  local all_pages="$titles.xml"

  # Start from a clean slate for today's run.
  rm -f -- "$titles".*

  local cmcontinue=""
  local count=1
  local rc
  while true; do
    echo "getting category titles $count to $(( count + PAGE_SIZE - 1 ))"

    # Fetch the next batch of titles.
    fetch_page "$category" "$cmcontinue" "$page_file"
    rc=$?
    if (( rc != 0 )); then
      die "Error $rc from curl, unable to get xml pages, bailing"
    fi
    cat -- "$page_file" >> "$all_pages" || die "unable to write $all_pages"

    # Get the continuation parameter, if any. It appears as an attribute,
    # e.g.: <categorymembers cmcontinue="Esthonia|" />
    # The value is taken by attribute name rather than by position so the
    # order of attributes on the tag does not matter.
    cmcontinue=$(awk '
      match($0, /cmcontinue="[^"]*"/) {
        # 12 = length of the prefix cmcontinue=" ; 13 adds the closing quote
        print substr($0, RSTART + 12, RLENGTH - 13)
        exit
      }' "$page_file")
    if [[ -z "$cmcontinue" ]]; then
      break
    fi

    # Be polite to the server between requests.
    sleep "$REQUEST_DELAY"
    count=$(( count + PAGE_SIZE ))
  done

  # Each member looks like <cm ns="0" title="..." />, so the title is the
  # fourth double-quote-delimited field. Wrap it as a wikilink.
  awk -F'"' '/<cm / { print "[[" $4 "]]" }' "$all_pages" > "$titles.txt" \
    || die "unable to write $titles.txt"

  echo "done!"
  exit 0
}

main "$@"