#!/usr/bin/perl # # A script that extracts person names from the lists # "Geboren" and "Gestorben" in German Wikipedia articles # on calendar years. # # This is merely a dirty hack. It works as follows: # 1. Download all years from 1000-now (http://de.wikipedia.org/wiki/1000) # using wget # 2. Use find -exec and feed each single file into this very script. # piping the output of the thing to a file. # 3. Use tool such as sort and uniq to clean up the outcome. # # # Licensing: # # extract-persons.pl # Copyright 2008-2010 Niels Ott # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # # use strict; use warnings; # slurp input of single ifle my $bak = $/; $/ = undef; my $input = <>; $/ = $bak; #print $input; my $start = quotemeta('

'); my $start2 = quotemeta('

'); $input =~ m/$start(.*)/s; my $rest = $1; if (! defined($rest) ) { $input =~ m/$start2(.*)/s; $rest = $1; } # einträge finden while ($rest =~ m/
  • .*?<\/a>(.*?)<\/li>/gs ) { my $entry = $2; # zeugs in klammern eliminieren $entry =~ s/\(.+?\)//g; #print "$entry\n\n"; # links in einträgen raussuchen while ($entry =~ m/([^<]+)<\/a>/gs) { my $url = $1; my $person = $2; print "\"$person\"\t\"$url\"\n"; } }