141 lines
2.7 KiB
Plaintext
141 lines
2.7 KiB
Plaintext
|
#!/usr/bin/perl -- # -*- Perl -*-
|
||
|
|
||
|
# this needs some cleanup...
|
||
|
|
||
|
my $PSTOTEXT = "pstotext";
|
||
|
|
||
|
my $pdf = shift @ARGV;
|
||
|
|
||
|
my $index = "";
|
||
|
my $inindex = 0;
|
||
|
open (F, "$PSTOTEXT $pdf |");
|
||
|
while (<F>) {
|
||
|
if (/^<\/index/) {
|
||
|
$index .= $_;
|
||
|
$inindex = 0;
|
||
|
}
|
||
|
$inindex = 1 if /^<index/;
|
||
|
|
||
|
if ($inindex) {
|
||
|
$index .= $_ if /^\s*</;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
my $cindex = "";
|
||
|
while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
|
||
|
$cindex .= $1;
|
||
|
$_ = $2;
|
||
|
$index = $'; # '
|
||
|
|
||
|
my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
|
||
|
|
||
|
# Expand ranges
|
||
|
if ($#pages >= 0) {
|
||
|
my @mpages = ();
|
||
|
foreach my $page (@pages) {
|
||
|
my $pageno = &pageno($page);
|
||
|
if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
|
||
|
for (my $count = $1; $count <= $2; $count++) {
|
||
|
push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
|
||
|
}
|
||
|
} else {
|
||
|
push (@mpages, $page);
|
||
|
}
|
||
|
}
|
||
|
@pages = sort rangesort @mpages;
|
||
|
}
|
||
|
|
||
|
# Remove duplicates...
|
||
|
if ($#pages > 0) {
|
||
|
my @mpages = ();
|
||
|
my $current = "";
|
||
|
foreach my $page (@pages) {
|
||
|
my $pageno = &pageno($page);
|
||
|
if ($pageno ne $current) {
|
||
|
push (@mpages, $page);
|
||
|
$current = $pageno;
|
||
|
}
|
||
|
}
|
||
|
@pages = @mpages;
|
||
|
}
|
||
|
|
||
|
# Collapse ranges...
|
||
|
if ($#pages > 1) {
|
||
|
my @cpages = ();
|
||
|
while (@pages) {
|
||
|
my $count = 0;
|
||
|
my $len = &rangelen($count, @pages);
|
||
|
if ($len <= 2) {
|
||
|
my $page = shift @pages;
|
||
|
push (@cpages, $page);
|
||
|
} else {
|
||
|
my $fpage = shift @pages;
|
||
|
my $lpage = "";
|
||
|
while ($len > 1) {
|
||
|
$lpage = shift @pages;
|
||
|
$len--;
|
||
|
}
|
||
|
my $fpno = &pageno($fpage);
|
||
|
my $lpno = &pageno($lpage);
|
||
|
$fpage =~ s/>$fpno</>${fpno}-$lpno</s;
|
||
|
push (@cpages, $fpage);
|
||
|
}
|
||
|
}
|
||
|
@pages = @cpages;
|
||
|
}
|
||
|
|
||
|
my $page = shift @pages;
|
||
|
$page =~ s/\s*$//s;
|
||
|
$cindex .= $page;
|
||
|
while (@pages) {
|
||
|
$page = shift @pages;
|
||
|
$page =~ s/\s*$//s;
|
||
|
$cindex .= ", $page";
|
||
|
}
|
||
|
}
|
||
|
$cindex .= $index;
|
||
|
|
||
|
print "$cindex\n";
|
||
|
|
||
|
sub pageno {
|
||
|
my $page = shift;
|
||
|
|
||
|
$page =~ s/^<phrase.*?>//;
|
||
|
$page =~ s/^<link.*?>//;
|
||
|
|
||
|
return $1 if $page =~ /^([^<>]+)/;
|
||
|
return "?";
|
||
|
}
|
||
|
|
||
|
sub rangesort {
|
||
|
my $apno = &pageno($a);
|
||
|
my $bpno = &pageno($b);
|
||
|
|
||
|
# Make sure roman pages come before arabic ones, otherwise sort them in order
|
||
|
return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
|
||
|
return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
|
||
|
return $apno cmp $bpno;
|
||
|
}
|
||
|
|
||
|
sub rangelen {
|
||
|
my $count = shift;
|
||
|
my @pages = @_;
|
||
|
my $len = 1;
|
||
|
my $inrange = 1;
|
||
|
|
||
|
my $current = &pageno($pages[$count]);
|
||
|
while ($count < $#pages && $inrange) {
|
||
|
$count++;
|
||
|
my $next = &pageno($pages[$count]);
|
||
|
if ($current + 1 eq $next) {
|
||
|
$current = $next;
|
||
|
$inrange = 1;
|
||
|
$len++;
|
||
|
} else {
|
||
|
$inrange = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return $len;
|
||
|
}
|