#!/usr/bin/perl use strict; use warnings; use Data::Dumper; use Getopt::Std; our %opts = ( ); getopts('hxi:f:o:s:', \%opts); my $usage = <<"__USAGE__"; forwardsearch takes as input a C4.5 file, removes features one by one and decides whether the overall result improves usage: $0 -i inputfile [-f featuremap] [-o outputfile] -i inputfile : file with C4.5 Timbl feature representation -f featuremap: file with tab separated feature name and position, one line per name-position pair -o outputfile: name of output file. Default: .forward.out -s settings : Timbl settings separated by ':', e.g. 'mM:k3' -x : run with all features only (for testing) __USAGE__ if ( $opts{h} || ! $opts{i} ) { print STDERR Dumper(%opts);; print STDERR $usage; exit; } elsif ( ! $opts{o} ) { $opts{o} = $opts{i} . '.forward.out'; } my %features; my %numeric; if ( $opts{f} ) { open FF, $opts{f}; while () { chomp; my ( $val, $key, $numeric ) = split /\t/; $features{$key} = $val; if ( $numeric ) { $numeric{$key} = 1; } } close FF; } my %settings = ( f => $opts{i}, k => 3, t => 'leave_one_out', m => 'M', v => 's', ); my $numeric = ':N' . join(',', keys %numeric ); if ( $opts{s} ) { foreach ( split /:/, $opts{s} ) { /^(\w)(.*)$/; $settings{$1} = $2; } } my $settings = join (" ", map { "-$_$settings{$_}" } grep { /^[^mN]$/ } keys %settings) . " -m$settings{m}:I"; my $timbl_init = "Timbl -% $settings$numeric"; print STDERR "'$timbl_init'\n\n"; my $init = `$timbl_init`; $init =~ /Feature Permutation based on GainRatio\/Values :\n<\s*([\d\s\,]+)\s*>/; my @ignore = split /,/, $1; map { s/\s//g; } @ignore; $init =~ /overall accuracy:\s+(\S+)/; my $init_best = $1; unless ( $opts{f} ) { foreach my $f ( @ignore ) { $features{$f} = $f; } } my @keep; my $best = 0; unless ( $opts{x} ) { for (my $i = 0; $i < scalar @ignore; $i++ ) { my $key = $ignore[$i]; my @tmp = grep { $_ > 0 && $_ ne $key } @ignore; $key =~ s/\s+//g; next unless $key; my $ignore = ""; if ( scalar @tmp ) { $ignore = join(',', @tmp); } my @tmp_numeric; foreach my $n ( keys %numeric ) { unless ( grep { $n == $_ } @tmp ) { push @tmp_numeric, $n; } } my $numeric = ':N' . join(',', @tmp_numeric ); my $timbl = "Timbl -% $settings$ignore$numeric"; print STDERR "'$timbl'\n\n"; my $res = `$timbl`; if ( $res =~ /overall accuracy:\s+(\S+)\s+\((\d+)\/(\d+).*?(\d+)/ ) { my ( $ratio, $found, $total, $exact ) = ( $1, $2, $3, $4 ); $best = 0.0000000 if $best == 0; my $diff = $ratio-$best; $diff = 0.0000000 if $diff == 0; print STDERR "'$features{$key}':\nRatio: $ratio\nDiff: $diff\nKeep? ", $diff>0?'yes':'no', "\n(Kept features:", join ( ", ", map { "'$features{$_}'" } @keep ), ")\n\n"; if ( $ratio > $best ) { $best = $ratio; push @keep, $ignore[$i]; $ignore[$i] = 0; } } } open IN, $opts{i}; open OUT, ">$opts{o}"; while () { chomp; s/\.$//; my @feats = split /,/; my $line = ''; foreach my $i ( grep { $_ } @keep ) { $line .= "$feats[$i-1],"; # $i-1 beacuse timbl's vectors start at '1' } $line .= "$feats[-1]\n"; print OUT $line; } close IN; close OUT; } my $kept = join ( ", ", map { "'$features{$_}'" } @keep ); my $ignored = join ( ", ", map { "'$features{$_}'" } grep { $_ } @ignore ); my $diff = $best - $init_best; print STDERR <<"RES"; Final Results: Kept: $kept Ignored: $ignored Initial Accuracy: $init_best Final Accuracy: $best Improvement: $diff New feature vectors printed to output file '$opts{o}' RES