#!/usr/bin/perl

=head1 SUMMARY

utf8lint - check for common UTF-8 encoding/decoding bugs in data

=head1 SYNOPSIS

Reads lines from standard input and reports each one that looks
double-encoded. Currently this is the only failure checked for: a
string that is already UTF-8 has been treated as another charset
(typically ISO Latin-1) and encoded as UTF-8 a second time.

Jun  9 2005 jm

=cut

use strict;
use warnings;
use Encode;

# CHECK flags for every Encode call below: die on malformed input, and leave
# the source string untouched.  Without LEAVE_SRC, Encode consumes the source
# in place when a CHECK value is given, which would blank out the line before
# it could be reported.
use constant DECODE_CHECK => Encode::FB_CROAK() | Encode::LEAVE_SRC();

binmode(STDIN, ":bytes");

# is_double_encoded($octets)
#
# Returns 1 if the byte string $octets looks like UTF-8 that was misread as
# ISO Latin-1 and encoded as UTF-8 a second time, 0 otherwise.
#
# The test is: the bytes decode as UTF-8, every resulting character fits in
# Latin-1, and the Latin-1 bytes of those characters decode as UTF-8 again.
# Strings without any high-bit byte are never flagged, since nothing can be
# concluded from them.  Never dies; decode/encode failures mean "not
# double-encoded".
sub is_double_encoded {
  my ($octets) = @_;

  return 0 unless defined $octets;
  return 0 unless $octets =~ /[\x80-\xff]/;   # no high bits?  we can't tell

  my $twice = eval {        # trap decode/encode failures
    my $once = Encode::decode_utf8($octets, DECODE_CHECK);

    # Map the decoded characters back to single bytes explicitly, rather
    # than relying on how decode treats a character string.  This croaks
    # if any character is above U+00FF.
    my $narrow = Encode::encode('iso-8859-1', $once, DECODE_CHECK);

    Encode::decode_utf8($narrow, DECODE_CHECK);
  };

  return defined $twice ? 1 : 0;
}

my $linenum = 0;

while (my $line = <STDIN>) {
  $linenum++;
  next unless is_double_encoded($line);

  print "line $linenum: possible double-encoded UTF-8: $line";
}