#!/usr/bin/perl

=head1 NAME

utf8lint - check for common UTF-8 encoding/decoding bugs in data

=head1 SYNOPSIS

    utf8lint < file

=head1 DESCRIPTION

Currently the only failure checked for is double-encoding, where a string
that is already UTF-8 has been treated as another charset (typically ISO
Latin-1) and re-encoded as UTF-8, resulting in double encoding.

Input is read line by line from standard input.  Every suspect line is
reported on standard output, prefixed with its line number.

Jun 9 2005 jm

=cut

use strict;
use warnings;
use Encode;

# Return 1 if the raw bytes in $line look like double-encoded UTF-8, else 0.
#
# A line is considered suspect when it decodes cleanly as UTF-8 *and* the
# characters produced by that first decode, taken as octets again, also
# decode cleanly as UTF-8.  Lines without any high-bit byte (and undef)
# are never reported.
sub _is_double_encoded {
    my ($line) = @_;

    return 0 unless defined $line;
    return 0 unless $line =~ /[\x80-\xff]/;    # no high bits? we can't tell

    # Decode a private copy: with a CHECK value such as FB_CROAK (and no
    # LEAVE_SRC) Encode may modify the source scalar in place, which would
    # wipe out the text the caller still wants to print.
    my $octets = $line;
    my $twice_decoded;

    eval {    # trap decode failures
        my $once_decoded = Encode::decode_utf8($octets, Encode::FB_CROAK);
        $twice_decoded = Encode::decode_utf8($once_decoded, Encode::FB_CROAK);
        1;
    };

    # Stays undef when either decode croaked inside the eval.
    return defined $twice_decoded ? 1 : 0;
}

# Scan STDIN as raw bytes and print one report line per suspect input line.
sub main {
    binmode(STDIN, ":bytes");

    my $linenum = 0;

    # NOTE(review): the readline operator was missing from the original
    # loop condition; <STDIN> is assumed because of the binmode() call above.
    while (my $line = <STDIN>) {
        $linenum++;
        next unless _is_double_encoded($line);
        print "line $linenum: possible double-encoded UTF-8: $line";
    }

    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without consuming STDIN.
main() unless caller;