way4thesub
unread,Jan 6, 2009, 6:16:45 PM1/6/09Sign in to reply to author
Sign in to forward
You do not have permission to delete messages in this group
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to php-text-statistics
Hello all,
I've translated the php-text-statistics package to Ruby; you can view
the files below. Please note that I couldn't get the Gunning Fog Score
to work 100%.
Regards,
Adam
############### Code
require 'collections/sequenced_hash'
# Readability metrics for English text (a Ruby port of the PHP
# php-text-statistics package).
module ReadabilityIndices
  # Normalises a text once at construction time and exposes word, sentence,
  # letter and syllable statistics plus the readability indices listed in
  # Titles.
  class Readability
    # Number of decimal places every index is rounded to.
    NumDecimalPlaces = 1

    # Display titles keyed by the name of the method that computes the index.
    # A plain Hash is sufficient: iteration follows insertion order
    # (Ruby 1.9+), which is the order used by #get_indices_as_string.
    Titles = {
      :flesch_kincaid_grade_level  => 'Flesch-Kincaid Grade level',
      :flesch_kincaid_reading_ease => 'Flesch-Kincaid Reading Ease',
      :gunning_fog_score           => 'Gunning-Fog score',
      :coleman_liau_index          => 'Coleman-Liau Index',
      :smog_index                  => 'SMOG Index',
      :automated_readability_index => 'Automated Readability Index'
    }

    # Pieces used to assemble the report string in #get_indices_as_string.
    Colon = ": ".freeze
    Separator = ", ".freeze

    # Words whose syllable count is looked up instead of computed.
    ProblemWords = {
      'simile' => 3,
      'forever' => 3,
      'shoreline' => 2
    }.freeze

    # Each match of one of these patterns subtracts one from the vowel-group
    # count (letter groups that would otherwise be counted twice).
    MultiSyllablesThatAreOne = [
      /cial/,
      /tia/,
      /cius/,
      /cious/,
      /giu/,
      /ion/,
      /iou/,
      /sia$/,
      /[^aeiuoyt]{2,}ed$/,
      /.ely$/,
      /[cg]h?e[rsd]?$/,
      /rved?$/,
      /[aeiouy][dt]es?$/,
      /[aeiouy][^aeiouydt]e[rsd]?$/,
      /^[dr]e[aeiou][^aeiou]+$/, # Sorts out deal, deign etc
      /[aeiouy]rse$/             # Purse, hears
    ].freeze

    # Each match of one of these patterns adds one to the vowel-group count
    # (letter groups that would otherwise be counted once).
    UniSyllablesThatAreTwo = [
      /ia/,
      /riet/,
      /dien/,
      /iu/,
      /io/,
      /ii/,
      /[aeiouym]bl$/,
      /[aeiou]{3}/,
      /^mc/,
      /ism$/,
      /([^aeiouy])\1l$/,
      /[^l]lien/,
      /^coa[dglx]./,
      /[^gq]ua[^auieo]/,
      /dnt$/,
      /uity$/,
      /ie(r|st)$/
    ].freeze

    # Affixes that are removed from the word before vowel groups are counted;
    # every removal counts as one syllable.
    PrefixesAndSuffixes = [
      /^un/,
      /^fore/,
      /ly$/,
      /less$/,
      /ful$/,
      /ers?$/,
      /ings?$/
    ].freeze

    # The text the statistics are computed from (normalised when it was
    # passed to the constructor).
    attr_reader :text

    # text - the raw text to analyse; it is normalised with #clean_text and
    #        the caller's string is left untouched.
    def initialize(text = '')
      self.text = clean_text(text)
    end

    # Stores +value+ as given (no normalisation) and drops the memoised word
    # list so that word-based statistics are recomputed for the new text.
    def text=(value)
      @words = nil
      @text = value
    end

    # Returns true when +index+ is one of the keys of Titles.
    def valid_index?(index)
      Titles.key?(index)
    end

    def flesch_kincaid_grade_level
      round(0.39 * average_words_per_sentence + 11.8 * average_syllables_per_word - 15.59, NumDecimalPlaces)
    end

    def flesch_kincaid_reading_ease
      round(206.835 - 1.015 * average_words_per_sentence - 84.6 * average_syllables_per_word, NumDecimalPlaces)
    end

    # Only words that start with a lower-case letter count as "complex" here
    # (count_proper_nouns = false). See the TODO in #clean_text: words that
    # start a sentence are therefore excluded as well.
    def gunning_fog_score
      round((average_words_per_sentence + percentage_words_with_three_syllables(false)) * 0.4, NumDecimalPlaces)
    end

    def coleman_liau_index
      round(5.89 * letter_count / word_count - 0.3 * sentence_count / word_count - 15.8, NumDecimalPlaces)
    end

    def smog_index
      # 30.0 forces real division; 30 / sentence_count with two Integers
      # would truncate (30 / 8 == 3) and understate the score.
      round(1.043 * Math.sqrt((words_with_three_syllables * (30.0 / sentence_count)) + 3.1291), NumDecimalPlaces)
    end

    def automated_readability_index
      round(4.71 * letter_count / word_count + 0.5 * word_count / sentence_count - 21.43, NumDecimalPlaces)
    end

    # Builds a one-line report such as "SMOG Index:  1.8, ...".
    #
    # indices     - Array of Titles keys to report; empty (or nil) means all
    #               of them, in the order of Titles.
    # diagnostics - when true the report is prefixed with the word, sentence
    #               and letter counts.
    #
    # Raises ArgumentError when an entry is not a key of Titles, so that
    # arbitrary method names are never dispatched.
    def get_indices_as_string(indices = [], diagnostics = true)
      indices = Titles.keys if indices.nil? || indices.empty?

      unknown = indices.reject { |index| valid_index?(index) }
      unless unknown.empty?
        raise ArgumentError, "unknown readability index: #{unknown.map { |index| index.inspect }.join(', ')}"
      end

      str = indices.map { |index| "#{Titles[index]}#{Colon} #{public_send(index)}" }.join(Separator)
      return str unless diagnostics

      "words#{Colon} #{word_count}#{Separator} sentences#{Colon} #{sentence_count}#{Separator} characters#{Colon} #{letter_count}#{Separator}" + str
    end

    # private

    # Returns a normalised copy of +text+: punctuation that does not end a
    # sentence becomes a space, every terminator becomes '.', line breaks
    # become spaces, and each '.' is attached to the preceding word and
    # followed by exactly one space. Text without any content yields ''.
    def clean_text(text)
      # Non-destructive gsub throughout: the argument may be frozen or still
      # in use by the caller.
      cleaned = text.to_s.gsub(/[,:;()-]/, ' ') # commas, hyphens etc count as spaces
      cleaned = cleaned.gsub(/[\.!?]/, '.')     # unify terminators
      cleaned = cleaned.strip
      return '' if cleaned.empty?

      cleaned += '.'                                      # final terminator, in case it is missing
      cleaned = cleaned.gsub(/[ ]*(\n|\r\n|\r)[ ]*/, ' ') # new lines become spaces
      cleaned = cleaned.gsub(/([\.])[\.\s?]+/, '.')       # collapse duplicated terminators
      # '\1' is resolved per match. Interpolating $1 into the replacement
      # would be evaluated before the substitution runs and reuse whatever
      # the previous match left behind.
      cleaned = cleaned.gsub(/[ ]*([\.])/, '\1 ')         # pad sentence terminators
      cleaned = cleaned.gsub(/[ ]+/, ' ')                 # remove multiple spaces
      # TODO: the PHP original additionally lower-cases the word following
      # each terminator (for the Gunning-Fog score):
      #   preg_replace_callback('/\. [^ ]+/', ... strtolower($matches[0]) ...)
      # That step has not been ported.
      cleaned.strip
    end

    # Rounds +num+ to +decimals+ decimal places and returns a Float.
    def round(num, decimals)
      factor = 10**decimals
      (num * factor).round / factor.to_f
    end

    # Number of ASCII letters in the text.
    def letter_count
      text.gsub(/[^A-Za-z]+/, '').length
    end

    # Number of terminator-delimited segments in the text, at least 1.
    def sentence_count
      [1, text.split(/[.!?]+/).length].max
    end

    def word_count
      get_words.length
    end

    # Whitespace-separated tokens of the text (memoised until the text is
    # replaced). Tokens keep any attached '.'.
    def get_words
      @words ||= text.split(/\s+/)
    end

    def average_words_per_sentence
      word_count / sentence_count.to_f
    end

    def average_syllables_per_word
      total_syllables / word_count.to_f
    end

    def total_syllables
      get_words.inject(0) { |sum, word| sum + syllable_count(word) }
    end

    # Counts the words with three or more syllables. With
    # count_proper_nouns = false, words whose first character is not
    # lower-case are left out.
    def words_with_three_syllables(count_proper_nouns = true)
      get_words.count do |word|
        next false if syllable_count(word) < 3

        count_proper_nouns || word[0..0] == word[0..0].downcase
      end
    end

    # Share of words with three or more syllables, as a percentage (Float).
    def percentage_words_with_three_syllables(count_proper_nouns = true)
      words_with_three_syllables(count_proper_nouns) / word_count.to_f * 100
    end

    # Estimates the number of syllables in +word+ (always at least 1).
    #
    # NOTE(review): affixes are stripped before non-letters are removed, so
    # a token that still carries its sentence terminator ("careful.") never
    # matches the '$'-anchored suffix patterns. This mirrors the order of
    # the original code - confirm whether it is intended.
    def syllable_count(word)
      word = word.to_s.downcase.strip

      # handle problem words first
      return ProblemWords[word] if ProblemWords.key?(word)

      # count and delete prefixes and suffixes, one pattern after the other
      num_syllables = 0
      PrefixesAndSuffixes.each do |affix|
        num_syllables += word.scan(affix).length
        word = word.gsub(affix, '')
      end

      # remove non-word chars; the word is already lower-case, and no
      # encoding modifier is wanted here (it would reject non-ASCII input)
      word = word.gsub(/[^a-z]/, '')

      # count word parts (runs of vowels)
      num_syllables += word.split(/[^aeiouy]+/).reject { |part| part.empty? }.length

      # subtract out syllables that are really one
      MultiSyllablesThatAreOne.each { |pattern| num_syllables -= word.scan(pattern).length }

      # add syllables that are really two
      UniSyllablesThatAreTwo.each { |pattern| num_syllables += word.scan(pattern).length }

      [1, num_syllables].max
    end
  end
end
############### RSpec tests
include ReadabilityIndices
describe "readability indices" do
# Instance built with the default (empty) text; the syllable examples below
# only call syllable_count on it with explicit words.
before(:each) do
@readability_blank = Readability.new
end
it "should count simple syllable words correctly" do
  # [word, expected syllable count], checked in this order
  simple_words = [
    ['a', 1],
    ['was', 1],
    ['the', 1],
    ['and', 1],
    ['foobar', 2],
    ['hello', 2],
    ['world', 1],
    ['wonderful', 3],
    ['simple', 2],
    ['easy', 2],
    ['hard', 1],
    ['quick', 1],
    ['brown', 1],
    ['fox', 1],
    ['jumped', 1],
    ['over', 2],
    ['lazy', 2],
    ['dog', 1],
    ['camera', 3]
  ]

  simple_words.each do |word, expected|
    @readability_blank.syllable_count(word).should == expected
  end
end
# The three words checked here are the keys of Readability::ProblemWords,
# which syllable_count looks up before applying any of its pattern rules.
it "should count syllables on programmed exceptions" do
@readability_blank.syllable_count('simile').should == 3
@readability_blank.syllable_count('shoreline').should == 2
@readability_blank.syllable_count('forever').should == 3
end
it "should count complex syllable words correctly" do
  # [word, expected syllable count], checked in this order. Kept as pairs
  # (not a Hash) because 'deserve' is deliberately listed twice.
  complex_words = [
    ['antidisestablishmentarianism', 12],
    ['supercalifragilisticexpialidocious', 14],
    ['chlorofluorocarbonation', 8],
    ['forethoughtfulness', 4],
    ['phosphorescent', 4],
    ['theoretician', 5],
    ['promiscuity', 5],
    ['unbutlering', 4],
    ['continuity', 5],
    ['craunched', 1],
    ['squelched', 1],
    ['scrounge', 1],
    ['coughed', 1],
    ['smile', 1],
    ['monopoly', 4],
    ['doughey', 2],
    ['doughier', 3],
    ['leguminous', 4],
    ['thoroughbreds', 3],
    ['special', 2],
    ['delicious', 3],
    ['spatial', 2],
    ['pacifism', 4],
    ['coagulant', 4],
    ['shouldn\'t', 2],
    ['mcdonald', 3],
    ['audience', 3],
    ['finance', 2],
    ['prevalence', 3],
    ['impropriety', 5],
    ['alien', 3],
    ['dreadnought', 2],
    ['verandah', 3],
    ['similar', 3],
    ['similarly', 4],
    ['central', 2],
    ['cyst', 1],
    ['term', 1],
    ['order', 2],
    ['fur', 1],
    ['sugar', 2],
    ['paper', 2],
    ['make', 1],
    ['gem', 1],
    ['program', 2],
    ['hopeless', 2],
    ['hopelessly', 3],
    ['careful', 2],
    ['carefully', 3],
    ['stuffy', 2],
    ['thistle', 2],
    ['teacher', 2],
    ['unhappy', 3],
    ['ambiguity', 5],
    ['validity', 4],
    ['ambiguous', 4],
    ['deserve', 2],
    ['blooper', 2],
    ['scooped', 1],
    ['deserve', 2],
    ['deal', 1],
    ['death', 1],
    ['dearth', 1],
    ['deign', 1],
    ['reign', 1],
    ['bedsore', 2],
    ['anorexia', 5],
    ['anymore', 3],
    ['cored', 1],
    ['sore', 1],
    ['foremost', 2],
    ['restore', 2],
    ['minute', 2],
    ['manticores', 3],
    ['asparagus', 4],
    ['unexplored', 3],
    ['unexploded', 4],
    ['CAPITALS', 3]
  ]

  complex_words.each do |word, expected|
    @readability_blank.syllable_count(word).should == expected
  end
end
it "should calculate average syllables per word" do
  # Fixtures are single-line strings (the pasted version had line breaks
  # inside the literals).
  Readability.new('and then there was one').average_syllables_per_word.should == 1
  Readability.new('because special ducklings deserve rainbows').average_syllables_per_word.should == 2
  Readability.new('and then there was one because special ducklings deserve rainbows').average_syllables_per_word.should == 1.5
end
it "should count words correctly" do
  # Variations in surrounding whitespace and terminators must not change
  # the count. Line breaks that the paste introduced inside the literals
  # are written as the spaces they replaced.
  Readability.new('The quick brown fox jumped over the lazy dog').word_count.should == 9
  Readability.new('The quick brown fox jumped over the lazy dog.').word_count.should == 9
  Readability.new('The quick brown fox jumped over the lazy dog. ').word_count.should == 9
  Readability.new(' The quick brown fox jumped over the lazy dog. ').word_count.should == 9
  Readability.new(' The quick brown fox jumped over the lazy dog. ').word_count.should == 9
  Readability.new('Yes. No.').word_count.should == 2
  Readability.new('Yes.No.').word_count.should == 2
  Readability.new('Yes.No.').word_count.should == 2
  Readability.new('Yes . No.').word_count.should == 2
  Readability.new('Yes - No. ').word_count.should == 2
end
it "should get percentage of words with three syllables" do
  # Calls without an argument count capitalised words too; calls passing
  # false skip words whose first letter is not lower-case.
  Readability.new('there is just one word with three syllables in this sentence').percentage_words_with_three_syllables.round.should == 9
  Readability.new('there are no valid words with three Syllables in this sentence').percentage_words_with_three_syllables.round.should == 9
  Readability.new('there is one and only one word with three or more syllables in this long boring sentence of twenty words').percentage_words_with_three_syllables.round.should == 5
  Readability.new('there are two and only two words with three or more syllables in this long sentence of exactly twenty words').percentage_words_with_three_syllables.round.should == 10
  Readability.new('there is Actually only one valid word with three or more syllables in this long sentence of Exactly twenty words').percentage_words_with_three_syllables(false).round.should == 5
  Readability.new('no long words in this sentence').percentage_words_with_three_syllables.round.should == 0
  Readability.new('no long valid words in this sentence because the test ignores proper case words like this Behemoth').percentage_words_with_three_syllables(false).round.should == 0
end
it "should count letters" do
  # Only the letters A-Z / a-z are counted; digits, spaces and punctuation
  # are ignored.
  Readability.new('a').letter_count.should == 1
  Readability.new('').letter_count.should == 0
  Readability.new('this sentence has 30 characters, not including the digits').letter_count.should == 46
end
it "should count sentences" do
  # A missing, repeated or mixed terminator still ends exactly one sentence;
  # commas, hyphens and parentheses never end one.
  Readability.new('This is a sentence').sentence_count.should == 1
  Readability.new('This is a sentence.').sentence_count.should == 1
  Readability.new('This is a sentence!').sentence_count.should == 1
  Readability.new('This is a sentence?').sentence_count.should == 1
  Readability.new('This is a sentence..').sentence_count.should == 1
  Readability.new('This is a sentence. So is this.').sentence_count.should == 2
  Readability.new("This is a sentence. \n\n So is this, but this is multi-line!").sentence_count.should == 2
  Readability.new('This is a sentence,. So is this.').sentence_count.should == 2
  Readability.new('This is a sentence!? So is this.').sentence_count.should == 2
  Readability.new('This is a sentence. So is this. And this one as well.').sentence_count.should == 3
  Readability.new('This is a sentence - but just one.').sentence_count.should == 1
  Readability.new('This is a sentence (but just one).').sentence_count.should == 1
end
it "should calculate average words per sentence" do
  # Line breaks that the paste introduced inside the literals are written
  # as the spaces they replaced.
  Readability.new('This is a sentence').average_words_per_sentence.should == 4
  Readability.new('This is a sentence.').average_words_per_sentence.should == 4
  Readability.new('This is a sentence. ').average_words_per_sentence.should == 4
  Readability.new('This is a sentence. This is a sentence').average_words_per_sentence.should == 4
  Readability.new('This is a sentence. This is a sentence.').average_words_per_sentence.should == 4
  Readability.new('This, is - a sentence . This is a sentence. ').average_words_per_sentence.should == 4
  Readability.new('This is a sentence with extra text. This is a sentence. ').average_words_per_sentence.should == 5.5
  Readability.new('This is a sentence with some extra text. This is a sentence. ').average_words_per_sentence.should == 6
end
describe "test indices directly" do
# Fixture texts shared by the index examples. @str_b to @str_e repeat the
# same sentence with different separators and terminators; @str_g is @str_f
# with two of its long words capitalised.
before(:each) do
  @str_a = 'This. Is. A. Nice. Set. Of. Small. Words. Of. One. Part. Each.'
  @str_b = 'The quick brown fox jumped over the lazy dog.'
  @str_c = 'The quick brown fox jumped over the lazy dog. The quick brown fox jumped over the lazy dog.'
  @str_d = "The quick brown fox jumped over the lazy dog. \n\n The quick brown fox jumped over the lazy dog."
  @str_e = 'The quick brown fox jumped over the lazy dog. The quick brown fox jumped over the lazy dog'
  @str_f = 'Now it is time for a more complicated sentence, including several longer words.'
  @str_g = 'Now it is time for a more Complicated sentence, including Several longer words.'
end
it "should calculate flesch-kincaid reading ease" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, 121.2],
    [@str_b, 94.3],
    [@str_c, 94.3],
    [@str_d, 94.3],
    [@str_e, 94.3],
    [@str_f, 50.5]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).flesch_kincaid_reading_ease.should == score
  end
end
it "should calculate flesch-kincaid grade level" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, -3.4],
    [@str_b, 2.3],
    [@str_c, 2.3],
    [@str_d, 2.3],
    [@str_e, 2.3],
    [@str_f, 9.4]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).flesch_kincaid_grade_level.should == score
  end
end
it "should calculate Gunning-Fog Score" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, 0.4],
    [@str_b, 3.6],
    [@str_c, 3.6],
    [@str_d, 3.6],
    [@str_e, 3.6],
    [@str_f, 14.4],
    [@str_g, 8.3]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).gunning_fog_score.should == score
  end
end
it "should calculate coleman-liau index" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, 3.0],
    [@str_b, 7.7],
    [@str_c, 7.7],
    [@str_d, 7.7],
    [@str_e, 7.7],
    [@str_f, 13.6]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).coleman_liau_index.should == score
  end
end
it "should calculate smog index" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, 1.8],
    [@str_b, 1.8],
    [@str_c, 1.8],
    [@str_d, 1.8],
    [@str_e, 1.8],
    [@str_f, 10.1]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).smog_index.should == score
  end
end
it "should calculate automated readability index" do
  # [fixture text, expected score], checked in this order
  expectations = [
    [@str_a, -5.6],
    [@str_b, 1.9],
    [@str_c, 1.9],
    [@str_d, 1.9],
    [@str_e, 1.9],
    [@str_f, 8.6]
  ]

  expectations.each do |sample, score|
    Readability.new(sample).automated_readability_index.should == score
  end
end
# End-to-end check on a longer prose passage: the raw counts are asserted
# first, then every index derived from them.
it "should index first paragraph of Moby Dick correctly" do
str =<<-ENDL
Call me Ishmael. Some years ago - never mind how long
precisely - having little or no money in my purse, and
nothing particular to interest me on shore, I thought I
would sail about a little and see the watery part of
the world. It is a way I have of driving off the spleen, and
regulating the circulation. Whenever I find myself
growing grim about the mouth; whenever it is a damp, drizzly
November in my soul; whenever I find myself
involuntarily pausing before coffin warehouses, and bringing
up the rear of every funeral I meet; and especially
whenever my hypos get such an upper hand of me, that it
requires a strong moral principle to prevent me from
deliberately stepping into the street, and methodically
knocking people's hats off - then, I account it high time
to get to sea as soon as I can. This is my substitute for
pistol and ball. With a philosophical flourish Cato
throws himself upon his sword; I quietly take to the ship.
There is nothing surprising in this. If they but knew
it, almost all men in their degree, some time or other,
cherish very nearly the same feelings towards the ocean with me.
ENDL
readability = Readability.new(str)
readability.letter_count.should == 884
readability.word_count.should == 201
readability.total_syllables.should == 304
readability.sentence_count.should == 8
readability.words_with_three_syllables.should == 23
readability.flesch_kincaid_grade_level.should == 12.1
readability.flesch_kincaid_reading_ease.should == 53.4
readability.gunning_fog_score.should == 14.2
readability.coleman_liau_index.should == 10.1
# NOTE(review): with 23 long words and 8 sentences, 8.9 is the value the
# SMOG expression yields when 30 / sentence_count is an Integer division
# (30 / 8 == 3); with real division (3.75) the same counts give 9.9.
# Confirm which value is intended.
readability.smog_index.should == 8.9
readability.automated_readability_index.should == 11.8
end
# End-to-end check on verse. The poem contains no '.', '!' or '?', so the
# whole text is expected to count as a single sentence, which is why the
# sentence-length based indices below are so extreme.
it "should index a Kipling poem correctly" do
str =<<-ENDL
If you can keep your head when all about you
Are losing theirs and blaming it on you,
If you can trust yourself when all men doubt you
But make allowance for their doubting too,
If you can wait and not be tired by waiting,
Or being lied about, don't deal in lies,
Or being hated, don't give way to hating,
And yet don't look too good, nor talk too wise:
If you can dream - and not make dreams your master,
If you can think - and not make thoughts your aim;
If you can meet with Triumph and Disaster
And treat those two impostors just the same;
If you can bear to hear the truth you've spoken
Twisted by knaves to make a trap for fools,
Or watch the things you gave your life to, broken,
And stoop and build 'em up with worn-out tools:
If you can make one heap of all your winnings
And risk it all on one turn of pitch-and-toss,
And lose, and start again at your beginnings
And never breath a word about your loss;
If you can force your heart and nerve and sinew
To serve your turn long after they are gone,
And so hold on when there is nothing in you
Except the Will which says to them: "Hold on"
If you can talk with crowds and keep your virtue,
Or walk with kings - nor lose the common touch,
If neither foes nor loving friends can hurt you;
If all men count with you, but none too much,
If you can fill the unforgiving minute
With sixty seconds' worth of distance run,
Yours is the Earth and everything that's in it,
And - which is more - you'll be a Man, my son
ENDL
readability = Readability.new(str)
readability.letter_count.should == 1125
readability.word_count.should == 292
readability.total_syllables.should == 338
readability.sentence_count.should == 1
readability.words_with_three_syllables.should == 6
readability.flesch_kincaid_grade_level.should == 111.9
readability.flesch_kincaid_reading_ease.should == -187.5
readability.gunning_fog_score.should == 117.5
readability.coleman_liau_index.should == 6.9
readability.smog_index.should == 14.1
readability.automated_readability_index.should == 142.7
end
end
end