// Stefanie Dipper - TIGERSearch templates for searching topological fields

// TIGERSearch templates for searching topological fields

// File: topological_fields_tlt13.tig

//////////////////////////////////////////
// TEMPLATES FOR TIGER-SCHEME CORPORA
//////////////////////////////////////////

// This file contains templates for querying topological fields
// in the TIGER corpus, as used and described in Dipper (2014), 
// Proceedings of TLT13.
//
// Most templates apply to the "context-free" version of TIGER,
// i.e. without crossing branches (= templates with suffix _cf).
// Only the VF template comes in two versions, one for the context-free 
// and one for the version with crossing branches (= suffix _enr).
// The templates have been tested on an "enriched" version of TIGER,
// i.e. one with unary nodes (e.g. all pronouns are dominated by an NP)
//
// The file contains
// 1. basic templates
// - precedence relations
// - left-corner relation (for non-terminals)
// 2. topological templates
// - VF (including V2)
// - LK
// - RK
// - MF
// - NF

//////////////////////////////////////////
// Basic templates
//////////////////////////////////////////

// Precedence relation
// - right corner of #x precedes left corner of #y
// - covers non-terminal nodes
// - allows for intervening quotes
prec(#x,#y) <-
  // Holds if #y starts directly after the end of #x, where the end of #x
  // is #x itself (terminal) or its right-corner terminal (non-terminal).
  // One quotation-mark token in between is tolerated (via prec_quote).
  ( // (a) terminal #x is directly followed by #y
    ( #x: [word=/.*/]
    & #x . #y
    )
  | // (b) non-terminal #x: its right corner is directly followed by #y
    ( #x: [cat=/.*/]
    & #x >@r #xRightCorner
    & #xRightCorner . #y
    )
  | // (c) same as (a)/(b), but with a quote token in between
    prec_quote(#x,#y)
  )
;

prec_quote(#x,#y) <-
  // Precedence across exactly one quotation-mark token:
  // the end of #x is directly followed by the quote token #q,
  // and #q is directly followed by #y.
  #q: [word=("\""|"``"|"''")]
& #q . #y                   // quote token directly precedes #y
& ( // (a) terminal #x directly precedes the quote token
    #x: [word=/.*/]
  & #x . #q
  | // (b) non-terminal #x: its right corner directly precedes the quote
    #x: [cat=/.*/]
  & #x >@r #xLast
  & #xLast . #q
  )
;

prec_comma(#x,#y) <-
  // Precedence across exactly one comma token:
  // the end of #x is directly followed by the comma token #c,
  // and #c is directly followed by #y.
  #c: [word="\,"]
& #c . #y                   // comma token directly precedes #y
& ( // (a) terminal #x directly precedes the comma
    #x: [word=/.*/]
  & #x . #c
  | // (b) non-terminal #x: its right corner directly precedes the comma
    #x: [cat=/.*/]
  & #x >@r #xLast
  & #xLast . #c
  )
;

// Left-corner dominance
// like >@l but can be used with nonterminals
hasLeftChild(#x,#y) <-
  // #y sits at the left edge of #x: either #y is the left-corner
  // terminal of #x, or #y dominates that terminal.
  ( // (a) #y is the left corner of #x itself
    #x >@l #y
  | // (b) #y is a non-terminal above the left corner of #x
    ( #x >@l #corner        // #corner: left-corner terminal of #x
    & #y >* #corner         // ... which is located below #y
    )
  )
;

//////////////////////////////////////////
// Vorfeld constituent
//////////////////////////////////////////

// For efficiency reasons broken up into two parts:
// 1. VFmain covers VF (and V2) in main clauses
// 2. VFsub covers VF in subord. clauses

// 1. VF (and V2) in main clauses
// Note: not all VF constituents are non-terminals
// e.g. 'hinzu/PTKVZ kommen/VVFIN einige...'
VFmain_cf(#vf,#v2) <-
  // #vf: Vorfeld constituent, an immediate child of an S node
  // #v2: finite verb (verb-second) heading the same S node
  #s: [cat="S"]
& #s > #vf
& #s >HD #v2
& #v2: [pos=/V.FIN/]

// #vf opens the clause
& ( // (a) #vf is located at the left edge of #s
    hasLeftChild(#s,#vf)
  | // (b) the clause starts with a coordinating conjunction
    //     (edge label JU), which is followed by #vf
    ( #s >@l #coord
    & [] >JU #coord
    & prec(#coord,#vf)
    )
  )

// #vf is followed by the finite verb
& ( // (a) without intervening comma
    prec(#vf,#v2)
  | // (b) with a comma in between ...
    ( // ... after a clausal #vf (S or VP)
      ( #vf: [cat=("S"|"VP")]
      & prec_comma(#vf,#v2)
      )
    | // ... or after a clause or apposition embedded in #vf
      ( #vf >* #embedded
      & ( #embedded: [cat=("S"|"VP")]
        | [] >APP #embedded
        )
      & prec_comma(#embedded,#v2)
      )
    )
  )
;

// 2. VF in subordinate clauses
VFsub_cf(#vf) <-
  // #vf: clause-initial child of an S node that contains a relative or
  // interrogative element (POS tag matching REL or W)
  #s: [cat="S"]
& hasLeftChild(#s,#vf)          // #vf is located at the left edge of #s
& #s > #vf                      // ... and is an immediate child of #s
& #vf >* [pos=/.*(REL|W).*/]    // REL/W element somewhere below #vf
;


// VF, version for crossing branches
VFmain_enr(#vf,#v2) <-
// Counterpart of VFmain_cf for trees with crossing branches.
// VF contains discontinuous element -> take daughter node
// #vf: Vorfeld constituent, a daughter of a discontinuous node below #s
// #v2: finite verb (verb-second) heading the S node #s
  #s: [cat="S"]
& #v2: [pos=/V.FIN/]
// Tie #v2 to #s as its head, as VFmain_cf does. The former constraint
// `#s > #vfin` used a variable that occurred nowhere else, so #v2 was
// not related to #s at all.
& #s >HD #v2
& #s >* #vf_disc        // #vf_disc: discontinuous mother of VF constituent
& discontinuous(#vf_disc)
& #vf_disc > #vf

// VF is first constituent
&  ( // 1. VF is very first element in the sentence
     hasLeftChild(#s,#vf)   // #vf is left-most child
   | // 2. Some coordinating conjunction precedes VF
     #s >@l #conj
   & [] >JU #conj
   & prec(#conj,#vf)
   )

// VF precedes VFIN
& ( // 1. VF directly precedes V2
    prec(#vf,#v2)
  | // 2. A comma may intervene after clausal or appositive VF
    ( #vf: [cat=("S"|"VP")]           // either VF itself precedes comma
    & prec_comma(#vf,#v2)
    | #vf >* #clause_app              // or some embedded constituent
    & ( #clause_app: [cat=("S"|"VP")]
      | [] >APP #clause_app
      )
    & prec_comma(#clause_app,#v2)
    )
  )
;

//////////////////////////////////////////
// Left Bracket + Verb second
//////////////////////////////////////////

// Note: LK only covers subordinating conjunctions
// LK filled by V2 has to be searched via the template VFmain!

LK_cf(#lk) <-
  // #lk: left bracket filled by a subordinating conjunction,
  // i.e. a token tagged KOUS or KOUI that has a mother node
  #lk: [pos=/KOUS|KOUI/]
& [] > #lk
;


//////////////////////////////////////////
// Right Bracket / Verb cluster
//////////////////////////////////////////

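// Middle field (MF). The template was broken up into three parts because
// the original version (MF_cf_orig, below) does not terminate in TIGERSearch.
// NOTE(review): reconstructed from a truncated comment -- confirm wording.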
MF_cf_orig(#mf1,#mf2) <-
  // Original (single-template) middle-field query.
  // #mf1: non-terminal that follows a left bracket (#lk, via LK_cf)
  // #mf2: non-terminal that is followed by a right bracket (#rk, via RK_cf)
  // Note: #mf1 and #mf2 are not related to each other by any constraint
  // here, and neither are #lk and #rk.
  // NOTE(review): RK_cf is not defined in the visible part of this file;
  // confirm that it is available before using this template.
  #mf1: [cat=/.*/]  // some constituent
& #mf2: [cat=/.*/]  // some constituent
& #lk: [] & LK_cf(#lk)
& #rk: [] & RK_cf(#rk)
& prec(#lk,#mf1)    // #lk (or its right corner) directly precedes #mf1
& prec(#mf2,#rk)    // right corner of #mf2 directly precedes #rk
;


//////////////////////////////////////////
// Nachfeld constituents
//////////////////////////////////////////

// The template only marks the beginning of NF (NFB)
// Note: the template uses a very simple heuristic!
// Template comes in two versions
// 1. NFB marks the first word of NF
// 2. NFBconst marks the first constituent of NF
//    (may result in spurious ambiguities)

NFB_cf(#nfB) <-
  // #nfB: first word of the Nachfeld, i.e. a token that follows the
  // right bracket #rk (directly or after a comma) and is itself neither
  // a verb form / verbal particle nor tagged with a $-tag.
  #nfB: [pos!=/(V.FIN|V.INF|V.IZU|V.PP|PTKZU|PTKVZ)/ & pos!=/\$.*/]

  // #rk: right bracket
& ( // (a) non-finite verb form or PTKVZ
    #rk: [pos=/(V.INF|V.IZU|V.PP|PTKVZ)/]
  | // (b) finite verb, but only directly after a non-finite verb form
    ( #rk: [pos=/V.FIN/]
    & #verb: [pos=/(V.INF|V.IZU|V.PP)/]
    & #verb . #rk
    )
  )

  // #rk is followed by #nfB
& ( prec(#rk,#nfB)          // directly
  | prec_comma(#rk,#nfB)    // or with a comma in between
  )
;

NFBconst_cf(#nfB) <-
  // #nfB: first constituent (non-terminal) of the Nachfeld, i.e. a node
  // that follows the right bracket #rk, directly or after a comma.
  #nfB: [cat=/.*/]

  // #rk: right bracket
& ( // (a) non-finite verb form or PTKVZ
    #rk: [pos=/(V.INF|V.IZU|V.PP|PTKVZ)/]
  | // (b) finite verb, but only directly after a non-finite verb form
    ( #rk: [pos=/V.FIN/]
    & #verb: [pos=/(V.INF|V.IZU|V.PP)/]
    & #verb . #rk
    )
  )

  // #rk is followed by #nfB
& ( prec(#rk,#nfB)          // directly
  | prec_comma(#rk,#nfB)    // or with a comma in between
  )
;