Диссертация (1137241), страница 24
Текст из файла (страница 24)
Пакет thicket2graph, файлGraphFromPTreeBuilder.java.public class GraphFromPTreeBuilder {publicGraph<ParseGraphNode,buildGraphFromPT(ParseThicket pt){DefaultEdge>PrintWriter out = new PrintWriter(System.out);List<Tree> ts = pt.getSentences();ts.get(0).pennPrint(out);Graph<ParseGraphNode,buildGGraphFromTree(ts.get(0));DefaultEdge>gfragment//ParseTreeVisualizer applet = new ParseTreeVisualizer();//applet.showGraph(gfragment);return gfragment;=195}privateGraph<ParseGraphNode,buildGGraphFromTree(Tree tree) {DefaultEdge>Graph<ParseGraphNode, DefaultEdge> g =newSimpleGraph<ParseGraphNode,DefaultEdge>(DefaultEdge.class);ParseGraphNode root = new ParseGraphNode(tree,"S 0");g.addVertex(root);navigate(tree, g, 0, root);return g;}private void navigate(Tree tree, Graph<ParseGraphNode, DefaultEdge>g, int l, ParseGraphNode currParent) {//String currParent = tree.label().value()+" $"+Integer.toString(l);//g.addVertex(currParent);if (tree.getChildrenAsList().size()==1)navigate(tree.getChildrenAsList().get(0),g,currParent);elseif (tree.getChildrenAsList().size()==0)return;for(Tree child: tree.getChildrenAsList()){String currChild = null;ParseGraphNode currChildNode = null;try {if (child.isLeaf())continue;if (child.label().value().startsWith("S"))l+1,196navigate(child.getChildrenAsList().get(0),g, l+1, currParent);if (!child.isPhrasal() || child.isPreTerminal())currChild=child.toString()+"#"+Integer.toString(l);elsecurrChild=child.label().value()+"#"+Integer.toString(l);currChildNode = new ParseGraphNode(child,currChild);g.addVertex(currChildNode);g.addEdge(currParent, currChildNode);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}navigate(child, g, l+1, currChildNode);}}Вычисление сходства на графах, представляющих чащи разбора.Пакет thicket2graph, файл EdgeProductBuilder.java.public class EdgeProductBuilder {private Matcher matcher = new Matcher();privateParseCorefsBuilderParseCorefsBuilder.getInstance();privateGraphFromPTreeBuilderGraphFromPTreeBuilder();ptBuildergraphBuilder==newpublic Graph<ParseGraphNode[], DefaultEdge>buildEdgeProduct(Graph<ParseGraphNode, DefaultEdge> g1,Graph<ParseGraphNode, DefaultEdge> g2 ){Graph<ParseGraphNode[], DefaultEdge> gp =newDefaultEdge>(DefaultEdge.class);SimpleGraph<ParseGraphNode[],197Set<DefaultEdge> edges1 = g1.edgeSet();Set<DefaultEdge> edges2 = g2.edgeSet();// build nodes of product graphfor(DefaultEdge e1:edges1){for(DefaultEdge e2:edges2){ParseGraphNodeg1.getEdgeSource(e1), sourceE1t = g1.getEdgeTarget(e1);sourceE1s=ParseGraphNodeg2.getEdgeSource(e2), sourceE2t = g2.getEdgeTarget(e2);sourceE2s=if(isNotEmpty(matcher.generalize(sourceE1s.getPtNodes(),&&sourceE2s.getPtNodes()))isNotEmpty(matcher.generalize(sourceE1t.getPtNodes(),sourceE2t.getPtNodes())))gp.addVertex(new{sourceE1s, sourceE1t, sourceE2s, sourceE2t } );ParseGraphNode[]}}Set<ParseGraphNode[]> productVerticesSet = gp.vertexSet();List<ParseGraphNode[]>productVerticesListArrayList<ParseGraphNode[]>(productVerticesSet);=newfor(int i=0; i<productVerticesList.size(); i++){for(int j=i+1; j<productVerticesList.size(); j++){ParseGraphNode[]prodVertexI=ParseGraphNode[]prodVertexJ=productVerticesList.get(i);productVerticesList.get(j);if(bothAjacentOrNeitherAdjacent(prodVertexI,prodVertexJ)){gp.addEdge(prodVertexI, prodVertexJ);}}}198return gp;}/** Finding the maximal clique is the slowest part*/publicCollection<Set<ParseGraphNode[]>>getMaximalCommonSubgraphs(Graph<ParseGraphNode[], DefaultEdge> g){BronKerboschCliqueFinder<ParseGraphNode[],DefaultEdge>finder =newBronKerboschCliqueFinder<ParseGraphNode[],DefaultEdge>(g);Collection<Set<ParseGraphNode[]>>finder.getBiggestMaximalCliques();cliques=return cliques;}privatebooleanbothAjacentOrNeitherAdjacent(ParseGraphNode[]prodVertexI,ParseGraphNode[] prodVertexJ) {List<ParseGraphNode> prodVertexIlist =newArrayList<ParseGraphNode>(Arrays.asList(prodVertexI));List<ParseGraphNode> prodVertexJlist =newArrayList<ParseGraphNode>(Arrays.asList(prodVertexJ));prodVertexIlist.retainAll(prodVertexJlist);return (prodVertexIlist.size()==2 || prodVertexIlist.size()==4);}private boolean isNotEmpty(List<List<ParseTreeChunk>> generalize) {if(generalize!=nullgeneralize.get(0).size()>0)return true;else&&generalize.get(0)!=null&&199return false;}publicCollection<Set<ParseGraphNode[]>>assessRelevanceViaMaximalCommonSubgraphs(String para1, String para2) {// first build PTs for each textParseThicket pt1 = ptBuilder.buildParseThicket(para1);ParseThicket pt2 = ptBuilder.buildParseThicket(para2);// then build phrases and rst arcsGraph<ParseGraphNode,graphBuilder.buildGraphFromPT(pt1);DefaultEdge>g1=Graph<ParseGraphNode,graphBuilder.buildGraphFromPT(pt2);DefaultEdge>g2=gp=Graph<ParseGraphNode[],buildEdgeProduct(g1, g2);DefaultEdge>Collection<Set<ParseGraphNode[]>>getMaximalCommonSubgraphs(gp);col=return col;}Приложение 4В данном приложении приведены основные фрагменты кода (наязыке Java), предназначенного для реализации поиска ответа насложные вопросы с помощью вычисления сходства чащ разбора и ихпроекций для вопроса и потенциальных ответов.Оценка итогового значения релевантности (score) на основерезультатов операции сходства чащ разбора.
Пакет textsimilarity, файлParseTreeChunkListScorer.java.public class ParseTreeChunkListScorer {// find the single expression with the highest scorepublic double getParseTreeChunkListScore(List<List<ParseTreeChunk>> matchResult) {double currScore = 0.0;for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult)for (ParseTreeChunk chunk : chunksGivenPhraseType) {200Double score = getScore(chunk);// System.out.println(chunk+ " => score >>> "+score);if (score > currScore) {currScore = score;}}return currScore;}// get max score per phrase type and then sum uppublic double getParseTreeChunkListScoreAggregPhraseType(List<List<ParseTreeChunk>> matchResult) {double currScoreTotal = 0.0;for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult) {double currScorePT = 0.0;for (ParseTreeChunk chunk : chunksGivenPhraseType) {Double score = getScore(chunk);// System.out.println(chunk+ " => score >>> "+score);if (score > currScorePT) {currScorePT = score;}}// if substantial for given phrase typeif (currScorePT > 0.5) {currScoreTotal += currScorePT;}}return currScoreTotal;}// score is meaningful only for chunks which are results of generalizationpublic double getScore(ParseTreeChunk chunk) {double score = 0.0;int i = 0;201for (String l : chunk.getLemmas()) {String pos = chunk.getPOSs().get(i);if (l.equals("*")) {if (pos.startsWith("CD")) { // number vs number gives high score// although different numbersscore += 0.7;} else if (pos.endsWith("_high")) { // if query modification adds 'high'score += 1.0;} else {score += 0.1;}} else {if (pos.startsWith("NN") || pos.startsWith("NP")|| pos.startsWith("CD") || pos.startsWith("RB")) {score += 1.0;} else if (pos.startsWith("VB") || pos.startsWith("JJ")) {if (l.equals("get")) { // 'common' verbs are not that importantscore += 0.3;} else {score += 0.5;}} else {score += 0.3;}}i++;}return score;}}Переупорядочивание результатов поиска.
Пакет textsimilarity,файл SearchResultsProcessor.java.202public class SearchResultsProcessor extends BingQueryRunner {private static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");private ParseTreeChunkListScorerParseTreeChunkListScorer();parseTreeChunkListScorer=newParserChunker2MatcherProcessor sm;WebSearchEngineResultsScraperWebSearchEngineResultsScraper();scraper=new/** Takes a search engine API (or scraped) search results and calculates theparse tree similarity* between the question and each snippet.
Ranks those snippets with higher* similarity score up*/private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,String searchQuery) {List<HitBase> newHitList = new ArrayList<HitBase>();sm = ParserChunker2MatcherProcessor.getInstance();for (HitBase hit : hits) {Stringsnapshot=hit.getAbstractText().replace("<b>...</b>",".").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", "").replace("<b>", "").replace("</b>", "");snapshot = snapshot.replace("</B>", "").replace("<B>", "").replace("<br>", "").replace("</br>", "").replace("...", ".
").replace("|", " ").replace(">", " ");snapshot += " . " + hit.getTitle();Double score = 0.0;try {SentencePairMatchResult matchRes = sm.assessRelevance(snapshot,searchQuery);List<List<ParseTreeChunk>> match = matchRes.getMatchResult();203score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);LOG.finest(score + " | " + snapshot);} catch (Exception e) {LOG.severe("Problem processing snapshot " + snapshot);e.printStackTrace();}hit.setGenerWithQueryScore(score);newHitList.add(hit);}Collections.sort(newHitList, new HitBaseComparable());LOG.info("\n\n ============= NEW ORDER ================= ");for (HitBase hit : newHitList) {LOG.info(hit.toString());}return newHitList;}public void close() {sm.close();}public List<HitBase> runSearch(String query) {List<HitBase> hits = scraper.runSearch(query);hits = calculateMatchScoreResortHits(hits, query);return hits;}public List<HitBase> runSearchViaAPI(String query) {List<HitBase> hits = null;try {List<HitBase> resultList = runSearch(query);204// now we apply our own relevance filterhits = calculateMatchScoreResortHits(resultList, query);} catch (Exception e) {// e.printStackTrace();LOG.info("No search results for query '" + query);return null;}return hits;}}Приложение 5В данном приложении приведены основные фрагменты кода (наязыке Java), предназначенного для построения узорных структур и ихпроекций на чащах разбора и реализации алгоритма кластеризациитекстов.Построение проекции узорной структуры на чащах разбора,алгоритмAddIntent.Пакетpattern_structure,файлPhrasePatternStructure.public class PhrasePatternStructure {int objectCount;int attributeCount;ArrayList<PhraseConcept> conceptList;ParseTreeMatcherDeterministic md;public PhrasePatternStructure(int objectCounts, int attributeCounts) {objectCount = objectCounts;attributeCount = attributeCounts;conceptList = new ArrayList<PhraseConcept>();PhraseConcept bottom = new PhraseConcept();md = new ParseTreeMatcherDeterministic();/*Set<Integer> b_intent = new HashSet<Integer>();for (int index = 0; index < attributeCount; ++index) {b_intent.add(index);205}bottom.setIntent(b_intent);*/bottom.setPosition(0);conceptList.add(bottom);}public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, intGenerator) {boolean parentIsMaximal = true;while(parentIsMaximal) {parentIsMaximal = false;for (int parent : conceptList.get(Generator).parents) {if(conceptList.get(parent).intent.containsAll(intent)) {Generator = parent;parentIsMaximal = true;break;}}}return Generator;}public int AddIntent(List<List<ParseTreeChunk>> intent, int generator){System.out.println("debug");System.out.println("called for " + intent);//printLattice();int generator_tmp = GetMaximalConcept(intent, generator);generator = generator_tmp;if (conceptList.get(generator).intent.equals(intent)) {System.out.println("atconceptList.get(generator).intent);generator:"System.out.println("to add:" + intent);System.out.println("already generated");return generator;}+206Set<Integer>conceptList.get(generator).parents;generatorParents=Set<Integer> newParents = new HashSet<Integer>();for (int candidate : generatorParents) {if (!intent.containsAll(conceptList.get(candidate).intent)){//if (!conceptList.get(candidate).intent.containsAll(intent)){//Set<Integer>HashSet<Integer>(conceptList.get(candidate).intent);intersection=new//List<List<ParseTreeChunk>> intersection = newArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent);//intersection.retainAll(intent);List<List<ParseTreeChunk>> intersection = md.matchTwoSentencesGroupedChunksDeterministic(intent,conceptList.get(candidate).intent);System.out.println("recursive call (inclusion)");candidate = AddIntent(intersection, candidate);}boolean addParents = true;System.out.println("now iterating over parents");Iterator<Integer> iterator = newParents.iterator();while (iterator.hasNext()) {Integer parent = iterator.next();if(conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {addParents = false;break;}else {if(conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {iterator.remove();}}}/*for (int parent : newParents) {207System.out.println("parent = " + parent);System.out.println("candidateintent:"+conceptList.get(candidate).intent);System.out.println("parentintent:"+conceptList.get(parent).intent);if(conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {addParents = false;break;}else {if(conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {newParents.remove(parent);}}}*/if (addParents) {newParents.add(candidate);}}System.out.println("size of lattice: " + conceptList.size());PhraseConcept newConcept = new PhraseConcept();newConcept.setIntent(intent);newConcept.setPosition(conceptList.size());conceptList.add(newConcept);conceptList.get(generator).parents.add(newConcept.position);for (int newParent: newParents) {if(conceptList.get(generator).parents.contains(newParent)) {conceptList.get(generator).parents.remove(newParent);}conceptList.get(newConcept.position).parents.add(newParent);}208return newConcept.position;}public void printLatticeStats() {System.out.println("Lattice stats");System.out.println("max_object_index = " + objectCount);System.out.println("max_attribute_index = " + attributeCount);System.out.println("Currentconceptcount="+conceptList.size());}public void printLattice() {for (int i = 0; i < conceptList.size(); ++i) {printConceptByPosition(i);}}public void printConceptByPosition(int index) {System.out.println("Concept at position " + index);conceptList.get(index).printConcept();}publicformGroupedPhrasesFromChunksForPara(List<List<ParseTreeChunk>>List<List<ParseTreeNode>> phrs) {List<List<ParseTreeChunk>>ArrayList<List<ParseTreeChunk>>();results=List<ParseTreeChunk>nps=ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(),pps = new ArrayList<ParseTreeChunk>();for(List<ParseTreeNode> ps:phrs){ParseTreeChunk ch = convertNodeListIntoChunk(ps);String ptype = ps.get(0).getPhraseType();if (ptype.equals("NP")){nps.add(ch);} else if (ptype.equals("VP")){vps.add(ch);} else if (ptype.equals("PP")){pps.add(ch);}newnew209}results.add(nps); results.add(vps); results.add(pps);return results;}privateconvertNodeListIntoChunk(List<ParseTreeNode> ps) {ParseTreeChunkList<String> lemmas = new ArrayList<String>(), poss = newArrayList<String>();for(ParseTreeNode n: ps){lemmas.add(n.getWord());poss.add(n.getPos());}ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);ch.setMainPOS(ps.get(0).getPhraseType());return ch;}}Приложение 6В данном приложении приведены основные фрагменты кода (наязыке Java), применявшегося для обучения на текстовых абзацах.Подготовка запросов для экспериментов.