").replace("<span class='best-phrase'>", "").replace("<span>", " ").replace("<span>", " ").replace("<b>","").replace("</b>", "");snippet=snippet.replace("</B>","").replace("<B>", "").replace("<br>","").replace("</br>", "").replace("...", ".
").replace("|","").replace(">", " ").replace(". .", ". ");searchResultText = hit.getTitle() + " " +snippet;}if (flag != -1) {//newHitList.add(hit);if (count <= (int) (hits.size() * lower_threshold)|| count >= (int) (hits.size() *upper_threshold)) {treeBankBuffer.addAll(formTreeKernelStructure(searchResultText,count,hits));}//middle 10 are used for classificationif (count > (int) (hits.size() * lower_threshold)&& count < (int) (hits.size() *upper_threshold)&& rnd.nextFloat() <= ((float) (1) upper_threshold + lower_threshold)*(upper_threshold - lower_threshold)) {treeBankClassifyBufferratio/220.addAll(formTreeKernelClassifyStructure(searchResultText));newHitList.add(hit);}}}}// write the lits of samples to a fileProfileReaderWriter.writeReport(treeBankBuffer,path+trainingFileName, ' ');//add examples to log/*String[] arrQuery = new String[1];arrQuery[0] = "Query = " + query;treeBankBuffer.add(0, arrQuery);treeBankBuffer.add(new String[] {" "});ProfileReaderWriter.appendReport(treeBankBuffer,detailedLearningOutput);// build the modeltkRunner.runLearner(path, trainingFileName, modelFileName);*/// now we preparing the same answers to be classifies in/out/*treeBankBuffer = new ArrayList<String[]>();for (HitBase hit : newHitList) {// not original docs now but instead a snippetString searchResultTextAbstr = hit.getAbstractText();Stringsnippet=searchResultTextAbstr.replace("<b>...</b>", ".
").replace("<span class='best-phrase'>"," ").replace("<span>", " ").replace("<span>", " ").replace("<b>", "").replace("</b>", "");snippet = snippet.replace("</B>", "").replace("<B>", "").replace("<br>","").replace("</br>","").replace("...", ". ").replace("|", " ").replace(">", " ").replace("..", ". ");snippet = hit.getTitle() + " " + snippet;221ParseThicketmatcher.buildParseThicketFromTextWithRST(snippet);pt=//hit.getPageContent());List<Tree> forest = pt.getSentences();// we consider the snippet as a single sentence to beclassifiedif (forest.size()>0){treeBankBuffer.add(newString[]{"0|BT|"+forest.get(0).toString()+ " |ET|"});newHitListReRanked .add(hit);}}*/ProfileReaderWriter.writeReport(treeBankClassifyBuffer,path+unknownToBeClassified, ' ');tkRunner.runClassifier(path,modelFileName, classifierOutput);unknownToBeClassified,// read classification resultsList<String[]>classifResultsProfileReaderWriter.readProfiles(path+classifierOutput, ' ');=HitBase h;// iterate through classification results and set them as scores forhits//newHitList = new ArrayList<HitBase>();for(int i=0; i < newHitList.size() && i < classifResults.size() ;i++){String scoreClassif = classifResults.get(i)[0];float val = Float.parseFloat(scoreClassif);h = newHitList.get(i);h.setGenerWithQueryScore(((double) val));newHitList.set(i, h);}// sort by SVM classification results222Collections.sort(newHitList, new HitBaseComparable());System.out.println("\n\n================= ");=============NEWORDERfor (HitBase hit : newHitList) {if (!(onlySnippets)){System.out.println(hit.getOriginalSentences().toString()+ " => "+hit.getGenerWithQueryScore());System.out.println("pagecontent="+hit.getPageContent());}System.out.println("title = "+hit.getAbstractText());System.out.println("snippet = "+hit.getAbstractText());System.out.println("match = "+hit.getSource());}return newHitList;}protectedList<String[]>searchResultText, int count, List<HitBase> hits) {formTreeKernelStructure(StringList<String[]> treeBankBuffer = new ArrayList<String[]> ();try {// if from the top of ranked docs, then positive, if from thebottom - negativeString posOrNeg = null;if (count<=(int) (hits.size() * lower_threshold))posOrNeg=" 1 ";else if (count >= (int) (hits.size() * upper_threshold))posOrNeg=" -1 ";elseposOrNeg=" 0 ";//middle for classification// form the list of training samplesif (posOrNeg != " 0 "){// get the parses from original documents, andform the training dataset223ParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);=List<Tree> forest = pt.getSentences();String[] sent = new String[1];sent[0] = posOrNeg;for(Tree t: forest){//treeBankBuffer.add(newString[]{posOrNeg+" |BT| "+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}if (sent[0] == posOrNeg){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);}} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return treeBankBuffer;}protectedsearchResultText) {List<String[]>formTreeKernelClassifyStructure(StringList<String[]> treeBankBuffer = new ArrayList<String[]> ();try {ParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);List<Tree> forest = pt.getSentences();String[] sent = new String[1];sent[0] = " 0 ";for(Tree t: forest){=224//treeBankBuffer.add(new String[] {" 0 " + |BT|"+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}if (sent[0] == " 0 "){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);} catch (Exception e) {// TODO Auto-generated catch 
        e.printStackTrace();
    }
    return treeBankBuffer;
}

Training and classification on the forest of extended trees. Package kernel_interface, file MultiSentenceKernelBasedExtendedForestSearchResultsProcessor (this class inherits from the class given in the previous listing). An illustrative sketch of the sample format produced by these methods is given after the listing.

public class MultiSentenceKernelBasedExtendedForestSearchResultsProcessor
        extends MultiSentenceKernelBasedSearchResultsProcessor {

    private static Logger LOG = Logger.getLogger(
            "opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor");

    protected TreeExtenderByAnotherLinkedTree treeExtender =
            new TreeExtenderByAnotherLinkedTree();

    protected List<String[]> formTreeKernelStructure(String searchResultText,
            int count, List<HitBase> hits) {
        List<String[]> treeBankBuffer = new ArrayList<String[]>();
        try {
            // if from the top of the ranked docs, then positive; if from the
            // bottom, then negative
            String posOrNeg = null;
            if (count <= (int) (hits.size() * lower_threshold))
                posOrNeg = " 1 ";
            else if (count >= (int) (hits.size() * upper_threshold))
                posOrNeg = " -1 ";
            else
                posOrNeg = " 0 "; // the middle is reserved for classification

            // form the list of training samples
            if (!posOrNeg.equals(" 0 ")) {
                // get the parses from the original documents and form the
                // training dataset
                ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);
                List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
                String[] sent = new String[1];
                sent[0] = posOrNeg;
                List<Tree> forest = pt.getSentences();
                for (Tree t : forest) {
                    sent[0] = sent[0] + " |BT| " + t.toString();
                }
                // adding the trees with semantic (coreference) arcs
                for (String t : extendedTreesDump) {
                    sent[0] = sent[0] + " |BT| " + t;
                }
                if (sent[0].equals(posOrNeg)) {
                    sent[0] += "|BT| |ET|"; // empty forest
                } else {
                    sent[0] += " |ET|";
                }
                treeBankBuffer.add(sent);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return treeBankBuffer;
    }

    protected List<String[]> formTreeKernelClassifyStructure(String searchResultText) {
        List<String[]> treeBankBuffer = new ArrayList<String[]>();
        try {
            ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);
            List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
            String[] sent = new String[1];
            sent[0] = " 0 "; // unlabeled sample to be classified
            List<Tree> forest = pt.getSentences();
            for (Tree t : forest) {
                sent[0] = sent[0] + " |BT| " + t.toString();
            }
            // adding the trees with semantic (coreference) arcs
            for (String t : extendedTreesDump) {
                sent[0] = sent[0] + " |BT| " + t;
            }
            if (sent[0].equals(" 0 ")) {
                sent[0] += "|BT| |ET|";
            } else {
                sent[0] += " |ET|";
            }
            treeBankBuffer.add(sent);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return treeBankBuffer;
    }
}
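Both processors above serialize a parse-thicket forest into the plain-text input format of the SVM tree-kernel learner: a class label (1 for positive, -1 for negative, 0 for samples still to be classified) followed by every tree of the forest, each preceded by a |BT| marker, with a single closing |ET|. The following minimal Python sketch of this line assembly is illustrative only; the helper name make_tk_line and the toy parses are assumptions, not part of the dissertation code.

def make_tk_line(label, trees):
    # label is " 1 ", " -1 " or " 0 "; trees are bracketed parse strings,
    # including, for the extended variant, trees carrying coreference arcs
    sent = label
    for t in trees:
        sent += " |BT| " + t
    if sent == label:          # empty forest: keep the markers well-formed
        sent += "|BT| |ET|"
    else:
        sent += " |ET|"
    return sent

print(make_tk_line(" 1 ", ["(S (NP He) (VP walks))",
                           "(S (NP He) (VP (VBZ owns) (NP a-dog)))"]))
#  1  |BT| (S (NP He) (VP walks)) |BT| (S (NP He) (VP (VBZ owns) (NP a-dog))) |ET|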
Preparation of queries based on YahooAnswers. Package apps, file YahooAnswersMiner.

public class YahooAnswersMiner extends BingQueryRunner {

    private static final Logger LOG = Logger
            .getLogger("opennlp.tools.similarity.apps.YahooAnswersMiner");

    private int page = 0;
    private static final int hitsPerPage = 50;

    public List<HitBase> runSearch(String query) {
        aq.setAppid(BING_KEY);
        // restrict the search to the Yahoo! Answers domain
        aq.setQuery("site:answers.yahoo.com " + query);
        aq.setPerPage(hitsPerPage);
        aq.setPage(page);
        aq.doQuery();

        List<HitBase> results = new ArrayList<HitBase>();
        AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
        for (AzureSearchWebResult anr : ars) {
            HitBase h = new HitBase();
            h.setAbstractText(anr.getDescription());
            h.setTitle(anr.getTitle());
            h.setUrl(anr.getUrl());
            results.add(h);
        }
        page++; // advance to the next result page for the following call
        return results;
    }

    public List<HitBase> runSearch(String query, int totalPages) {
        int count = 0;
        List<HitBase> results = new ArrayList<HitBase>();
        while (totalPages > page * hitsPerPage) {
            results.addAll(runSearch(query));
            if (count > 10) // safety cap on the number of requests
                break;
            count++;
        }
        return results;
    }
}

Appendix 7

This appendix presents the key fragments of the code (in Python) that was used to implement the method for identifying identical denotations and to carry out the experiments on the applied ontology and on the generated formal contexts.

Data generation.
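The data-generation listing itself begins on the following page of the original text. As a hedged illustration of the kind of routine this heading introduces (not the dissertation's actual code), a random formal context can be represented as a Boolean object-attribute incidence matrix; the function name generate_random_context and its parameters are assumptions.

import random

def generate_random_context(n_objects, n_attributes, density, seed=None):
    # context[g][m] is True iff object g has attribute m;
    # density is the probability that a given cell of the context is filled
    rnd = random.Random(seed)
    return [[rnd.random() < density for _ in range(n_attributes)]
            for _ in range(n_objects)]

ctx = generate_random_context(n_objects=8, n_attributes=5, density=0.3, seed=1)
for row in ctx:
    print("".join("X" if v else "." for v in row))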