File size: 1,050 Bytes
0e5da39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/bash
#
# Convert PDT files to CoNLL-U with Treex and rewrite the IDs in the result.
#
# Usage: $0 target_dir layer files
#   target_dir  output root; each input is written to <target_dir>/<file>.conllu
#   layer       layer suffix; the input is read from <file>.<layer>
#   files       one or more input paths given without the layer suffix
#
# Post-processing applied to every generated file:
#   * a "# newdoc id = <doc>" line is inserted before the first line, where
#     <doc> is the file argument with every "/" replaced by "-";
#   * sentences are renumbered: "# sent_id = <doc>-s<N>", N counting from 1;
#   * when the original sent_id ends in "-p<X>s<Y>", a
#     "# newpar id = <doc>-p<N>" line is emitted each time p<X> changes;
#   * for layer "m" only, a 0 in the seventh column (HEAD in CoNLL-U) of
#     integer-ID token lines is replaced by "_".
#
# Relies on treex, gawk (with the "inplace" extension) and a sed supporting
# -i and \t. Exits 1 on wrong usage or when any file could not be converted.

(( $# >= 2 )) || { echo "Usage: $0 target_dir layer files" >&2; exit 1; }

target=$1
layer=$2
shift 2

resources="$(dirname -- "$0")/resources"
status=0

for file in "$@"; do
  out="$target/$file.conllu"
  doc_id=${file//\//-}

  # Do not post-process when the conversion itself failed: the output may be
  # missing or left over from an earlier run.
  if ! treex -Lcs \
      Read::PDT "schema_dir=$resources" "from=$file.$layer" "top_layer=$layer" \
      Write::CoNLLU "to=$out" use_tree_id_as_sent_id=1 \
      upos=is_parenthesis_root xpos=tag feats=is_member deprel=afun; then
    echo "$0: treex failed for $file.$layer" >&2
    status=1
    continue
  fi

  # The document id is handed over with -v instead of being spliced into the
  # program text, so quotes or "&" in a file name cannot alter the program.
  if ! gawk -i inplace -v doc_id="$doc_id" '
    BEGIN { sent_no = 1; par_no = 1; last_par = "" }

    # Document header goes before the first line of the file.
    FNR == 1 { print "# newdoc id = " doc_id }

    /^# sent_id = / {
      # The last dash-separated component of the tree id may look like pXsY.
      n = split($4, parts, /-/)
      tail = parts[n]
      if (tail ~ /^p[0-9]+s[0-9]+$/) {
        # Paragraph part is everything before the "s".
        par = substr(tail, 1, index(tail, "s") - 1)
        if (par != last_par) {
          print "# newpar id = " doc_id "-p" par_no
          par_no++
          last_par = par
        }
      }
      $4 = doc_id "-s" sent_no
      sent_no++
    }

    { print }
  ' "$out"; then
    echo "$0: post-processing failed for $out" >&2
    status=1
    continue
  fi

  if [[ "$layer" == m ]]; then
    # Columns are tab-separated: ID, five more columns, then the seventh one,
    # followed by exactly three further columns.
    sed -i -e \
      's/^\([0-9][0-9]*\t\([^\t]*\t\)\{5\}\)0\(\(\t[^\t]*\)\{3\}\)$/\1_\3/' \
      -- "$out" || status=1
  fi
done

exit "$status"