Диссертация (1137241), страница 22
Текст из файла (страница 22)
Пакет parse_thicket, файл ParseCorefsBuilder.java.

/**
 * Builds a {@code ParseThicket} for a text: one parse tree and one list of
 * {@code ParseTreeNode}s per sentence, plus inter-sentence arcs derived from
 * coreference chains and from communicative actions.
 */
public class ParseCorefsBuilder {
    protected static ParseCorefsBuilder instance;
    private Annotation annotation;
    StanfordCoreNLP pipeline;
    CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();

    /**
     * Singleton method of instantiating the processor.
     *
     * @return the instance
     */
    public synchronized static ParseCorefsBuilder getInstance() {
        if (instance == null) {
            instance = new ParseCorefsBuilder();
        }
        return instance;
    }

    /** Configures the annotation pipeline with the annotators used by {@link #buildParseThicket}. */
    ParseCorefsBuilder() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse,dcoref");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Annotates {@code text} and assembles the parse thicket.
     *
     * @param text the text to analyze
     * @return a thicket holding the sentence parse trees, the coreference and
     *         communicative-action arcs, and the per-sentence node lists
     */
    public ParseThicket buildParseThicket(String text) {
        List<Tree> ptTrees = new ArrayList<Tree>();
        // all numbering from 1, not 0
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();

        annotation = new Annotation(text);
        try {
            pipeline.annotate(annotation);
            List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
            if (sentences != null) {
                for (CoreMap sentence : sentences) {
                    List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();

                    // traversing the words in the current sentence
                    // a CoreLabel is a CoreMap with additional token-specific methods
                    Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
                    List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
                    int count = 1;
                    for (CoreLabel token : coreLabelList) {
                        // this is the text of the token (TextAnnotation, not LemmaAnnotation)
                        String lemma = token.get(TextAnnotation.class);
                        // this is the POS tag of the token
                        String pos = token.get(PartOfSpeechAnnotation.class);
                        // this is the NER label of the token
                        String ne = token.get(NamedEntityTagAnnotation.class);
                        nodes.add(new ParseTreeNode(lemma, pos, ne, count));
                        count++;
                    }
                    nodesThicket.add(nodes);
                    Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
                    ptTrees.add(tree);
                }
            }
        } catch (Exception e) {
            // Best effort: whatever was collected before the failure is still returned.
            e.printStackTrace();
        }

        // now coreferences
        Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        // The chain map is absent when annotation failed above; skip instead of throwing.
        if (corefs != null) {
            for (CorefChain c : new ArrayList<CorefChain>(corefs.values())) {
                List<CorefMention> mentions = c.getMentionsInTextualOrder();
                // one arc for every ordered pair (i < j) of mentions in the same chain
                for (int i = 0; i < mentions.size(); i++) {
                    for (int j = i + 1; j < mentions.size(); j++) {
                        CorefMention mi = mentions.get(i), mj = mentions.get(j);
                        int niSentence = mi.position.get(0);
                        int niWord = mi.startIndex;
                        int njSentence = mj.position.get(0);
                        int njWord = mj.startIndex;

                        ArcType arcType = new ArcType("coref-", mj.mentionType + "" + mj.animacy, 0, 0);

                        WordWordInterSentenceRelationArc arc = new WordWordInterSentenceRelationArc(
                                new Pair<Integer, Integer>(niSentence, niWord),
                                new Pair<Integer, Integer>(njSentence, njWord),
                                mi.mentionSpan, mj.mentionSpan, arcType);
                        arcs.add(arc);
                        // Other CorefMention fields useful when debugging: animacy, mentionSpan,
                        // mentionID, position, startIndex, endIndex, mentionType, number.
                    }
                }
            }
        }

        // The communicative-action arcs were previously computed and then dropped;
        // they are now part of the resulting thicket.
        List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
        arcs.addAll(arcsCA);

        ParseThicket result = new ParseThicket(ptTrees, arcs);
        result.setNodesThicket(nodesThicket);
        return result;
    }

    /**
     * Links every pair of sentences that both contain a communicative action (CA)
     * with a "ca" arc labelled by the generalization of the two actions.
     *
     * @param nodesThicket per-sentence node lists
     * @return the CA arcs; empty when fewer than two sentences contain a CA
     */
    private List<WordWordInterSentenceRelationArc> buildCAarcs(List<List<ParseTreeNode>> nodesThicket) {
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        for (int sentI = 0; sentI < nodesThicket.size(); sentI++) {
            // The lookup for sentence I does not depend on J, so it is done once per outer iteration.
            List<ParseTreeNode> sentenceI = nodesThicket.get(sentI);
            Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
            if (caI == null) {
                continue;
            }
            int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);

            for (int sentJ = sentI + 1; sentJ < nodesThicket.size(); sentJ++) {
                List<ParseTreeNode> sentenceJ = nodesThicket.get(sentJ);
                Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
                if (caJ == null) {
                    continue;
                }
                int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);

                List<? extends Pair<String, Integer[]>> generalized = caFinder.generalize(caI, caJ);
                if (generalized == null || generalized.isEmpty()) {
                    continue; // nothing in common between the two actions
                }
                Pair<String, Integer[]> caGen = generalized.get(0);

                ArcType arcType = new ArcType("ca",
                        caGen.getFirst().toString() + printNumArray(caGen.getSecond()), 0, 0);
                // NOTE(review): sentence indices here are the 0-based loop counters, while the
                // comment in buildParseThicket says numbering starts from 1 - confirm that
                // consumers of the arcs expect this.
                WordWordInterSentenceRelationArc arc = new WordWordInterSentenceRelationArc(
                        new Pair<Integer, Integer>(sentI, indexCA1),
                        new Pair<Integer, Integer>(sentJ, indexCA2),
                        caI.getFirst(), caJ.getFirst(), arcType);
                arcs.add(arc);
            }
        }
        return arcs;
    }

    /**
     * Renders the numbers separated (and followed) by a single space.
     *
     * @param arr the numbers to print
     * @return e.g. {@code "1 2 3 "}
     */
    private String printNumArray(Integer[] arr) {
        StringBuilder buf = new StringBuilder();
        for (Integer i : arr) {
            buf.append(Integer.toString(i)).append(" ");
        }
        return buf.toString();
    }
}

Выявление риторических связей, построение на их основе и обобщение расширенных групп.
Пакет rhetoric_structure, файл RhetoricStructureMarker.java.

/**
 * Detects rhetoric-structure (RST) markers in a sentence by matching it against
 * a fixed list of templates. Each template is a short sequence of
 * {@code ParseTreeNode}s constructed from two strings (the second is compared
 * through {@code getPos()}), where {@code "*"} is used as a wildcard.
 */
public class RhetoricStructureMarker implements IGeneralizer<Integer[]> {
    //private static String rstRelations[] = {"antithesis", "concession", "contrast", "elaboration"};
    List<Pair<String, ParseTreeNode[]>> rstMarkers = new ArrayList<Pair<String, ParseTreeNode[]>>();

    /** Registers the built-in marker templates, in the order they are tried. */
    public RhetoricStructureMarker() {
        // NOTE(review): several templates pair a word with POS "," (e.g. "than", "although",
        // "because"); kept as in the original listing - confirm this is intended.
        addMarker("contrast", new ParseTreeNode(",", ","), new ParseTreeNode("than", ","));
        addMarker("antithesis", new ParseTreeNode("although", ","), new ParseTreeNode("*", "*"));
        addMarker("contrast", new ParseTreeNode(",", ","), new ParseTreeNode("however", "*"));
        addMarker("contrast", new ParseTreeNode("however", "*"), new ParseTreeNode(",", ","),
                new ParseTreeNode("*", "prp"));
        addMarker("elaboration", new ParseTreeNode(",", ","), new ParseTreeNode("*", "NN"));
        addMarker("elaboration", new ParseTreeNode("as", "*"), new ParseTreeNode("a", "*"));
        addMarker("explanation", new ParseTreeNode(",", ","), new ParseTreeNode("because", ","));
        addMarker("example", new ParseTreeNode("for", "IN"), new ParseTreeNode("example", "NN"));
        // Was "ye" in the listing, which looks like a truncated "yet" (compare the two
        // "yet" templates that follow and the parallel ", however" template above).
        addMarker("contrast", new ParseTreeNode(",", ","), new ParseTreeNode("yet", "*"));
        addMarker("contrast", new ParseTreeNode("yet", "*"), new ParseTreeNode(",", ","),
                new ParseTreeNode("*", "prp"));
        addMarker("contrast", new ParseTreeNode("yet", "*"), new ParseTreeNode("i", "*"));
        addMarker("explanation", new ParseTreeNode(",", ","), new ParseTreeNode("where", "*"));
        // "as long as": the leading ("as","*") node is commented out in the original template
        addMarker("temp_sequence", new ParseTreeNode("*", "RB"), new ParseTreeNode("as", "IN"));
        // same here: the original carries a commented-out leading ("as","*") node
        addMarker("temp_sequence", new ParseTreeNode("*", "VB*"), new ParseTreeNode("until", "IN"));
    }

    /** Appends one (relation name, template) pair to {@link #rstMarkers}. */
    private void addMarker(String relation, ParseTreeNode... template) {
        rstMarkers.add(new Pair<String, ParseTreeNode[]>(relation, template));
    }

    /**
     * For a sentence, obtains the list of markers found in it.
     *
     * @param sentence the sentence as a list of nodes
     * @return one pair per matching template: the relation name and an integer
     *         array with the start/end position of the first occurrence of
     *         that marker in the sentence
     */
    public List<Pair<String, Integer[]>> extractRSTrelationInSentenceGetBoundarySpan(
            List<ParseTreeNode> sentence) {
        List<Pair<String, Integer[]>> results = new ArrayList<Pair<String, Integer[]>>();
        for (Pair<String, ParseTreeNode[]> template : rstMarkers) {
            List<Integer[]> spanList = generalize(sentence, template.getSecond());
            if (!spanList.isEmpty()) {
                results.add(new Pair<String, Integer[]>(template.getFirst(), spanList.get(0)));
            }
        }
        return results;
    }

    /**
     * Rule application in the form of generalization: generalizing a sentence
     * with a rule (a template) yields the occurrence of the rhetoric marker.
     *
     * @param o1 the sentence, a {@code List<ParseTreeNode>}
     * @param o2 the rule/template, a {@code ParseTreeNode[]} specifying lemmas
     *           and/or POS, including punctuation
     * @return a list holding the {@code {start, end}} span (inclusive) of the
     *         first match, or an empty list when the template does not occur
     *         or is empty
     * @see opennlp.tools.parse_thicket.IGeneralizer#generalize(java.lang.Object, java.lang.Object)
     */
    @Override
    public List<Integer[]> generalize(Object o1, Object o2) {
        List<Integer[]> result = new ArrayList<Integer[]>();
        // The interface is untyped; callers in this class always pass these two types.
        @SuppressWarnings("unchecked")
        List<ParseTreeNode> sentence = (List<ParseTreeNode>) o1;
        ParseTreeNode[] template = (ParseTreeNode[]) o2;
        // An empty template used to "match" at position 0 with the span {0, -1}.
        if (template.length == 0) {
            return result;
        }

        // Only start positions where the whole template still fits can match.
        for (int start = 0; start + template.length <= sentence.size(); start++) {
            int matched = 0;
            while (matched < template.length
                    && matchesTemplateNode(template[matched], sentence.get(start + matched))) {
                matched++;
            }
            // the only condition for a successful match is reaching the end of the template
            if (matched == template.length) {
                result.add(new Integer[] {start, start + matched - 1});
                return result;
            }
            // no match at this position: proceed to the next sentence word
        }
        return result;
    }

    /**
     * Tells whether a sentence word satisfies one template node: their
     * generalization must exist and must not be the fully generic ("*", "*") node.
     */
    private static boolean matchesTemplateNode(ParseTreeNode templateNode, ParseTreeNode word) {
        List<ParseTreeNode> gRes = templateNode.generalize(templateNode, word);
        if (gRes.isEmpty() || gRes.get(0) == null) {
            return false;
        }
        ParseTreeNode generalized = gRes.get(0);
        // NOTE(review): if generalizing a ("*","*") template node also yields ("*","*"),
        // such a node can never match - verify against ParseTreeNode.generalize.
        return !(generalized.getWord().equals("*") && generalized.getPos().equals("*"));
    }

    /**
     * Prints markers as {@code [name:start end  | name:start end  | ]}.
     *
     * @param res markers as returned by {@link #extractRSTrelationInSentenceGetBoundarySpan}
     * @return the textual form of the list
     */
    public String markerToString(List<Pair<String, Integer[]>> res) {
        StringBuilder buf = new StringBuilder();
        buf.append("[");
        for (Pair<String, Integer[]> marker : res) {
            buf.append(marker.getFirst()).append(":");
            for (int a : marker.getSecond()) {
                buf.append(a).append(" ");
            }
            buf.append(" | ");
        }
        buf.append("]");
        return buf.toString();
    }
}

Выявление, построение коммуникативных действий и обобщение расширенных групп, построенных на их основе.