#!/usr/bin/perl use strict; use warnings; use Data::Dumper; use Getopt::Std; #use XML::Simple; use Term::ANSIColor qw(:constants); our %opts = ( b => 1, d => 0, f => 0, n => 0, s => 0, x => 0, k => 0, ); getopts('hbdnfsxko:', \%opts); my $usage = <<"__USAGE__"; swb-transform: Script to filter the Switchboard Corpus, outputs Penn Treebank bracket annotation usage: $0 [-options] inputfile Output options: -b bracket annotation output (default) -d dan bikel parser output: '((word1 (pos1)) ... (wordN (posN)))' Filtering options: -f NoDisfluencies (full filtering, remove all disfluencies and non-word disfluency markup) -s NoMarkup (some filtering, remove all non-word disfluency markup) -n Original (minimal filtering, remove punctuation and traces only) Other options: -x insert extra parentheses -k keep: print line number which are kept after filtering (starting from 0) -o keep_file: output lines which are found in keep_file (e.g. output from '-k') __USAGE__ my $tree_count = -1; my $keep = {}; if ( $opts{h} || scalar @ARGV != 1 ) { print $usage; exit; } if ( $opts{d} ) { $opts{b} = 0; } if ( $opts{o} ) { open KEPT, $opts{o} || die; while () { chomp; $keep->{$_} = 1; } close KEPT; } # Get IN file my $in = shift @ARGV; # Traverse corpus open IN, $in; my $tree = ""; while () { chomp; if ( $_ !~ /^\s*[\(\)]/ || $_ =~ /^\s*\( \(CODE/ ) { next; } elsif ( /^\(/ ) { print_tree( $tree ); $tree = $_; } else { $tree = add_to_tree( $tree, $_ ); } } close IN; sub add_to_tree { my ( $tree, $line ) = @_; # Append line to tree $tree .= $line; return $tree; } sub get_words { my ( $line ) = @_; my @wds = (); while ( $line =~ /\([^\(\)\s]+ [^\(\)\s]+\)/ ) { $line =~ s/\(([^\(\)\s]+) ([^\(\)\s]+)\)//; push @wds, [ $2, $1 ]; } return \@wds; } sub print_tree { my ( $tree ) = @_; $tree_count++; return undef unless $tree; $tree =~ s/^\(\s*(.*)\)$/$1/; $tree = remove_spaces( $tree ); # Need this for DB Parser $tree =~ s/\-NONE\-/NONE/g; # Map -DFL- to DFL (maltparser requires this...) $tree =~ s/\-DFL\-/DFL/g; my @remove; if ( $opts{n} ) { # REMOVE NOTHING } if ( $opts{s} || $opts{f}) { # remove traces push @remove, 'NONE'; # remove puncuation push @remove, '\"'; push @remove, '\:'; push @remove, '\,'; push @remove, '\.'; # remove DFL push @remove, 'DFL'; } if ( $opts{f} ) { # REMOVE SPEECH RELATED NODES # remove uh interjections push @remove, 'UH'; # remove repairs push @remove, 'EDITED'; # remove unanalyzable nodes push @remove, 'X'; # remove parenthenticals push @remove, 'PRN'; # remove trees with unfinished as last node return undef if $tree =~ /\([A-Z\-]+\-UNF/; } if ( $opts{x} ) { # REMOVE EDGE LABELS $tree =~ s/(\(\w+)\-[\w\d\-]+/$1/g; } $tree = remove_nodes( \@remove, $tree ); my $words = get_words( $tree ); return undef unless scalar @$words >= 2; $tree = remove_empty_nodes( $tree ); $tree = remove_spaces( $tree ); if ( $opts{k} ) { print "$tree_count\n"; } elsif ( ( ! $opts{o} ) || $keep->{$tree_count} ) { if ( $opts{b} ) { # Add space after node label $tree =~ s/\(\(/\( \(/g; if ( $opts{x} ) { print "( $tree)\n"; } else { print "$tree\n"; } } elsif ( $opts{d} ) { my $res = join " ", map { "(".$_->[0]." (".$_->[1]."))" } @{ $words }; print "($res)\n"; } } return undef; } sub remove_empty_nodes { my ( $tree ) = @_; while ( $tree =~ /\([^\s\(]+\s*\)/ ) { $tree =~ s/\([^\s\(]+\s*\)/ /g; } return $tree; } sub remove_punctuation { my ( $tree ) = @_; $tree =~ s/\([\,\.\?\:\;\:\!] [\,\.\?\:\;\:\!]\)//g; return $tree; } sub remove_spaces { my ( $tree ) = @_; $tree =~ s/\n/ /gs; $tree =~ s/\s+\)/\)/g; $tree =~ s/\(\s+/\(/g; $tree =~ s/^\s+//; $tree =~ s/\s+$//; $tree =~ s/\s\s+/ /g; return $tree; } sub remove_nodes { my ( $pos_aref, $tree ) = @_; foreach my $pos ( @$pos_aref ) { # print "\nBEFORE ($pos)\n\t$tree\n"; if ( $tree =~ /(\($pos) / ) { while ( $tree =~ /(\($pos)/ ) { $tree = remove_single_node( index($tree, "$1" ), $tree ); } # print "\nAFTER ($pos)\n\t$tree\n"; } } return $tree; } sub remove_single_node { my ( $index, $tree ) = @_; my $before = substr($tree, 0, $index); my $after = substr($tree, $index); my @chars = (split '', $after); my $brackets = 0; for (my $i=0;$i<=$#chars;$i++) { my $c = $chars[$i]; if ( $c eq '(' ) { $brackets++; } elsif ( $c eq ')' ) { $brackets--; } if ( $brackets == 0 ) { my $new_tree = "$before" . join ('', @chars[$i+1..$#chars]); return $new_tree; } } die "Could not remove node: $!"; return undef; }