#!/usr/bin/perl

# Readability report for a plain-text file.
#
# Usage: pass the name of the text file as the only argument.
#
# Output, in order:
#   1. Lingua::EN::Fathom's report for the whole file.
#   2. For each metric, a ranking of the file's non-blank lines (each line is
#      treated as one paragraph), worst first.
#   3. A ranking of sentences by their TeX hyphenation-point count.
#   4. The lines that appeared in the largest number of rankings.

use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);

use Lingua::EN::Fathom;
use TeX::Hyphen;
use List::Util qw(min);

# Rankings print indices 0 .. $LAST_RANK, i.e. at most $LAST_RANK + 1 rows.
my $LAST_RANK = 30;

# Number of leading characters of a line shown in the per-line rankings.
my $PREVIEW_WIDTH = 70;

my $filename = shift or die "Give me a filename";

# Constructor options that can be passed to TeX::Hyphen, kept for reference:
#   file     => 'hyphen.tex',
#   style    => 'czech',
#   leftmin  => 2,
#   rightmin => 2,
# Typical call: my @points = $hyp->hyphenate($word);
#
# The "or die" guards against the constructor handing back undef; without it
# the failure would only surface later as a method call on an undefined value.
my $hyp = TeX::Hyphen->new()
    or die "Cannot load TeX hyphenation patterns";

my $fathom = Lingua::EN::Fathom->new();

# First print out a report on the whole file.
$fathom->analyse_file($filename);
print( $fathom->report );

=pod

Example of the report's output:

    Number of characters       : 65102
    Number of words            : 10707
    Percent of complex words   : 15.36
    Average syllables per word : 1.6678
    Number of sentences        : 579
    Average words per sentence : 18.4922
    Number of text lines       : 246
    Number of blank lines      : 248
    Number of paragraphs       : 246

    READABILITY INDICES

    Fog                        : 13.5424
    Flesch                     : 46.9706
    Flesch-Kincaid             : 11.3019

=cut

=pod

Disabled code, kept for reference:

    # Print out the longest words and their frequency.
    my %words = $fathom->unique_words;
    my @longest = reverse sort {
        length($a) * $words{$a} <=> length($b) * $words{$b}
    } keys %words;
    @longest = grep { length($_) > 4 } @longest;
    foreach my $i ( 0 .. min(30, $#longest ) ) {
        printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]});
    }

=cut

# Now find the longest and least readable paragraphs, and the sentences with
# the most hyphenation points.
#
# Each key is the name of a Lingua::EN::Fathom accessor method.  The value is
# 1 if a larger number is better and 0 otherwise; it decides which end of the
# sorted list counts as "worst".
my %more_is_better = (
    num_chars             => 0,
    num_words             => 0,
    percent_complex_words => 0,
    num_sentences         => 0,
    syllables_per_word    => 0,
    words_per_sentence    => 0,
    fog                   => 0,
    flesch                => 1,
    kincaid               => 0,
);

# Sorted so that the report sections come out in the same order on every run
# (plain "keys" order is not stable between runs).
my @fathom_metrics = sort keys %more_is_better;

my @stats;       # one hashref per analysed line: the line text plus metrics
my %syll_for;    # hyphenation-point total, keyed by punctuation-free sentence

open my $fi, '<', $filename
    or die "Cannot open '$filename': $OS_ERROR";

LINE:
while ( my $line = <$fi> ) {
    chomp $line;

    # Skip blank lines.  Testing for a non-space character (rather than plain
    # truthiness) keeps a line consisting of "0" and drops lines that hold
    # only whitespace, such as the "\r" left behind by CRLF line endings.
    next LINE unless $line =~ /\S/;

    my $st = { line => $line, tex_syllables => 0 };

    $fathom->analyse_block($line);
    for my $metric (@fathom_metrics) {
        $st->{$metric} = $fathom->$metric();
    }

    # My own metric: hyphenation points per paragraph and per sentence.
    # A sentence boundary is ., ! or ? followed by whitespace and a capital.
    for my $sentence ( split /[.!?]\s+(?=[A-Z])/, $line ) {
        $sentence =~ s/[^\w\s]+//g;

        # split ' ' ignores leading whitespace, so no empty "word" is ever
        # handed to the hyphenator.
        for my $word ( split ' ', $sentence ) {

            # Take the list of break positions and count it explicitly
            # instead of relying on the method's scalar-context behaviour.
            # NOTE: this counts break points, not syllables proper, so a word
            # with no break points contributes 0.
            my @break_points = $hyp->hyphenate($word);
            my $count        = scalar @break_points;

            $syll_for{$sentence}  += $count;
            $st->{tex_syllables} += $count;
        }
    }

    push @stats, $st;
}
close $fi;

# Per-metric rankings, worst line first.  %worst_overall counts how many of
# these rankings each line appeared in.
my %worst_overall;
for my $metric ( 'tex_syllables', @fathom_metrics ) {
    my @ranked = sort { $a->{$metric} <=> $b->{$metric} } @stats;

    # tex_syllables has no entry in %more_is_better, so it is treated like
    # the "less is better" metrics: largest value first.
    @ranked = reverse @ranked unless $more_is_better{$metric};

    # Lines with a Flesch score of exactly 0 are left out of that ranking.
    @ranked = grep { $_->{flesch} != 0 } @ranked if $metric eq 'flesch';

    print "====================== $metric\n";
    for my $st ( @ranked[ 0 .. min( $LAST_RANK, $#ranked ) ] ) {
        printf "%6.4f %s\n", $st->{$metric},
            substr( $st->{line}, 0, $PREVIEW_WIDTH );
        $worst_overall{ $st->{line} }++;
    }
    print "\n";
}

# Syllables-per-sentence report.  Ties are broken alphabetically so the
# output does not depend on hash ordering.
my @top_sentences =
    sort { $syll_for{$b} <=> $syll_for{$a} or $a cmp $b } keys %syll_for;
print "====================== syllables per sentence\n";
for my $sentence ( @top_sentences[ 0 .. min( $LAST_RANK, $#top_sentences ) ] ) {
    print $syll_for{$sentence}, ' ', $sentence, "\n";
}

# Lines that showed up as bad in most of the reports above.
my @worst_lines =
    sort { $worst_overall{$b} <=> $worst_overall{$a} or $a cmp $b }
    keys %worst_overall;
print "\n====================== worst overall\n";
for my $text ( @worst_lines[ 0 .. min( $LAST_RANK, $#worst_lines ) ] ) {
    print $worst_overall{$text}, ' ', substr( $text, 0, $PREVIEW_WIDTH ), "\n";
}