percona-toolkit/util/parse-aspell-output

#!/usr/bin/env perl

# This script parses the output of ispell/aspell --pipe into something
# meaningful.  The output is like:
#
#   @(#) International Ispell Version 3.1.20 (but really Aspell 0.60.6)
#   & mk 50 0: Mk, km, mks, ml, K, M, k, ...
#   *
#   *
#   *
#   *
#
#   *
#   *
#   & mk 50 6: Mk, km, mks, ml, K, M, k, ...
#
# Spelling errors are the "& WORD COUNT OFFSET: SUGGESTIONS" lines.  We
# don't care about COUNT and OFFSET is per-word (or so it seems), so it's
# not helpful to us either.  We also don't care about the suggestions.
#
# What we care about is on which line the bad WORD appears.  Lines are
# separated by blank lines in the output, so the example output above
# reflects 2 lines of input.  Each asterisk line is a correctly spelled
# word.

use strict;
use warnings FATAL => 'all';

use English qw(-no_match_vars);
use Data::Dumper;

# Two arguments are required, and each must name an existing plain file:
# first the saved ispell/aspell --pipe output, then the POD text whose
# lines that output is matched against.
my $ispell_output = $ARGV[0];
my $pod_text      = $ARGV[1];
if ( !$ispell_output || !-f $ispell_output ) {
   die "No ispell output file given";
}
if ( !$pod_text || !-f $pod_text ) {
   die "No POD text file given";
}

# A single handle is used for both input files, one after the other.
my $fh;

# Read the whole POD text into memory so its lines can be looked up by
# index while the ispell output is processed.
if ( !open($fh, '<', $pod_text) ) {
   die "Cannot open $pod_text: $OS_ERROR";
}
my @pod = <$fh>;
close $fh;

open $fh, '<', $ispell_output or die "Cannot open $ispell_output: $OS_ERROR";

# Walk the ispell output and, for every misspelled word that is not
# whitelisted below, print the word, up to three suggestions, and the POD
# line it is matched to.
#
#   $pod_lineno  1-based number of the output group currently being read;
#                incremented for every blank line in the ispell output.
#   $i           number of non-blank @pod lines consumed so far.
#   $j           index of the next unread element of @pod.
my $pod_lineno = 1;
my $i = 0;
my $j = 0;
LINE:
while ( defined(my $line = <$fh>) ) {
   # A blank line ends the results for one input line.
   if ( $line =~ m/^\s*$/ ) {
      $pod_lineno++;
      next LINE;
   }

   # Only "& WORD COUNT OFFSET: SUGGESTIONS" lines are of interest; the
   # version banner and "*" (correctly spelled) lines fall through here.
   my ($word, $correct) = $line =~ m/^& (\w+) \d+ \d+: (.+)/;
   next LINE unless $word;

   # "mk" is always ignored, whatever line it appears on.
   next LINE if $word eq 'mk';

   # Advance through @pod, counting only non-blank lines, until as many
   # have been consumed as output groups have been seen (or @pod runs out).
   # $j is left one past the line that corresponds to this group.
   while ( $i < $pod_lineno && $j <= $#pod ) {
      $i++ if $pod[$j++] ne "\n";
   }

   my $pod_line = $pod[$j - 1];

   # Option attribute lines such as "type: ..." or "short form: ...".
   next LINE if $pod_line =~ m/^\s*(?:type|short form): [\w-]+/;

   # Any utf-ish word on a line that mentions utf8.
   next LINE if $word =~ m/utf/i && $pod_line =~ m/utf8/i;

   # The word is itself an option name on a line of its own (--WORD).
   next LINE if $pod_line =~ m/^\s+--$word$/;

   # Specific words that are acceptable in specific contexts.
   next LINE if $word eq 'maatkit'  && $pod_line =~ m/maatkit manpage/;
   next LINE if $word eq 'maatkit'  && $pod_line =~ m{http://code.google.com/p/maatkit/};
   next LINE if $word eq 'dsn'      && $pod_line =~ m/dsn: \w+/;
   next LINE if $word eq 'tmp'      && $pod_line =~ m/tmp table/;
   next LINE if $word eq 'toolname' && $pod_line =~ m/Where "toolname"/;

   $pod_line =~ s/^\s+//;

   # Suggestions are comma-separated; trim the whitespace around each one.
   # (The leading trim must be \s+, not a literal "s+", or suggestions that
   # start with "s" lose letters and the padding spaces are kept.)
   my @correct = split(',', $correct);
   for my $suggestion ( @correct ) {
      $suggestion =~ s/^\s+//;
      $suggestion =~ s/\s+$//;
   }

   # Show at most the first three suggestions; the slice yields undef for
   # positions past the end of the list, which grep drops.
   # $pod_line normally still carries its own newline, so a blank line
   # follows each report.
   print "  Misspelled: $word\n"
      .  " Suggestions: " . join(', ', grep { defined $_ } @correct[0..2]) . "\n"
      .  "        Line: $pod_line\n";
}

close $fh;
exit;