#!/bin/bash
#
# Convert PDT files to CoNLL-U with Treex and normalise the comment lines.
#
# Usage: <script> target_dir layer files
#
#   target_dir  directory that receives one "<file>.conllu" per input
#   layer       layer suffix; the input read is "<file>.<layer>" and the
#               value is also passed to Treex as top_layer
#   files       input names without the layer suffix
#
# For every input the script
#   1. runs treex (Read::PDT -> Write::CoNLLU), using the "resources"
#      directory next to this script as schema_dir;
#   2. rewrites the comment lines of the produced file in place:
#        - "# newdoc id = <doc>" is inserted before the first line,
#        - "# newpar id = <doc>-p<N>" is inserted whenever the paragraph
#          part of a tree id changes,
#        - sentence ids are renumbered to "<doc>-s<N>",
#      where <doc> is the input name with every "/" replaced by "-";
#   3. for layer "m" only, replaces a "0" in the seventh column (HEAD in
#      CoNLL-U) of ten-column token lines by "_".
#      NOTE(review): presumably because the m layer carries no dependency
#      structure -- confirm.
#
# Exit status: 1 on a usage error or if any file failed, 0 otherwise.
# Requires: treex, gawk with the "inplace" extension, GNU sed.

[ $# -ge 2 ] || { echo "Usage: $0 target_dir layer files" >&2; exit 1; }

target="$1"; shift
layer="$1"; shift

resources="$(dirname -- "$0")/resources"

status=0

for file in "$@"; do
  output="$target/$file.conllu"
  doc_id="${file//\//-}"

  # Do not post-process when the conversion failed: the output may be missing
  # or left over from an earlier run, and would then be annotated twice.
  if ! treex -Lcs \
      Read::PDT "schema_dir=$resources" "from=$file.$layer" "top_layer=$layer" \
      Write::CoNLLU "to=$output" use_tree_id_as_sent_id=1 \
      upos=is_parenthesis_root xpos=tag feats=is_member deprel=afun; then
    echo "$0: treex failed for $file.$layer, skipping" >&2
    status=1
    continue
  fi

  # The document id is handed over through the environment instead of being
  # pasted into the program text (or given with -v, which would interpret
  # backslash escapes), so any character in the file name is taken literally.
  CONLLU_DOC_ID="$doc_id" gawk -i inplace '
    BEGIN {
      doc_id = ENVIRON["CONLLU_DOC_ID"]
      s = 1; p = 1; last_pid = -1
    }

    # Document header goes in front of the very first line.
    FNR == 1 { print "# newdoc id = " doc_id }

    /^# sent_id = / {
      # The last dash-separated component of the tree id is expected to
      # look like p<paragraph>s<sentence>.
      n = split($4, parts, /-/)
      p_s = parts[n]
      if (p_s ~ /^p[0-9]+s[0-9]+$/) {
        # Paragraph key: everything before the "s".
        pid = substr(p_s, 1, index(p_s, "s") - 1)
        if (pid != last_pid) {
          print "# newpar id = " doc_id "-p" p
          p += 1
          last_pid = pid
        }
      }
      # Sentences are numbered consecutively within the document.
      $4 = doc_id "-s" s
      s += 1
    }

    { print }
  ' "$output" || status=1

  if [[ "$layer" == m ]]; then
    # Token line = numeric id + 9 further tab-separated columns; blank out a
    # zero in column 7.
    sed -i \
      -e 's/^\([0-9][0-9]*\t\([^\t]*\t\)\{5\}\)0\(\(\t[^\t]*\)\{3\}\)$/\1_\3/' \
      -- "$output" || status=1
  fi
done

exit "$status"