#!/usr/bin/perl # © 2010 Graham Shaw. # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright # notice and this notice are preserved. This file is offered as-is, # without any warranty. # This script attempts to isolate adjectives and adverbs from a body # of text. The text is read from stdin, and the output written to # stdout: # # ./find-modifiers.pl < text.txt > modifiers.csv # # The threshold (the minimum number of occurrances needed to trigger # a match) can be changed using the -t option. By default it is set # to one. use Getopt::Std; my %opts; getopts('t:i:',\%opts); my $threshold = 1; if (defined $opts{'t'}) { $threshold = $opts{'t'}; } my $hedge = "sightly|somewhat|moderately|fairly|very|extremely"; my %words = (); my %modifiers = (); while () { chomp; my $line = $_; my @words = split /[^\p{L}\p{N}]/,$line; my $word2=undef; foreach my $word (@words) { ++$words{$word}; if ($word =~ /^[a-z]+$/) { if ($word2 =~ /^($hedge)$/) { ++$modifiers{$word}; } } $word2=$word; } } foreach my $adjective (sort keys %modifiers) { next unless $adjective =~ /[aeiouy]/; my $alform = substitute($adjective,['e?$','al']); next if ((exists $words{$alform}) && ($words{$alform}>=2)); if ($modifiers{$adjective}>=$threshold) { check_paradigm($adjective,['$','ly']); check_paradigm($adjective,['e?$','ally']); } } sub substitute { my $text = shift; my @substitutions = @_; foreach my $substitution (@substitutions) { my ($pattern,$replacement) = @$substitution; my $result = $text; my $expression = "\$result =~ s/$pattern/$replacement/"; my $found = eval "\$result =~ s/$pattern/$replacement/"; if ($found) { return $result; } } return undef; } sub check_paradigm { my $adjective = shift; my @substitutions = @_; my $adverb = substitute($adjective,@substitutions); if ((exists $modifiers{$adverb}) && ($modifiers{$adverb}>=$threshold)) { print "$adjective($modifiers{$adjective}),$adverb($modifiers{$adverb})\n"; } }