Диссертация (1137218), страница 27
Текст из файла (страница 27)
Since searchresults are ranked, we set the top-20 as positive set,//and the bottom-20 as negative set.// after re-classification, being re-ranked, the search results mightend up in a different setList<String[]> treeBankBuffer = new ArrayList<String[]>();List<String[]>ArrayList<String[]>();treeBankClassifyBuffer=newString snippet;Random rnd = new Random();int count = 0;int flag = 0;for (HitBase hit : hits) {count++;flag = 0;// if orig content has been already set in HIT object, ok;otherwise set itif(!(hit.getUrl().matches(".*\\.(doc[xm]|doc|DOC[XM]|DOC|jar|JAR|XLS|xls)$"))){//if (hit.getUrl().contains("ATTACHMENT01")){String searchResultText;if (!(onlySnippets)) {searchResultText = hit.getPageContent();if (searchResultText == null) {try {String[]pageSentsAndSnippet = formTextForReRankingFromHit(hit);searchResultTextpageSentsAndSnippet[0];hit.setPageContent(searchResultText);} catch (Exception e) {=228// skip if we are not able tofetch page contente.printStackTrace();flag = -1;} catch (NoClassDefFoundErrore1) {e1.printStackTrace();flag = -1;} catch (NoSuchMethodError e1) {// skip if we are not able tofetch page contente1.printStackTrace();flag = -1;}}} else {//getting snippetssearchResultText = hit.getAbstractText();snippet=searchResultText.replace("<b>...</b>", ".
").replace("<span class='best-phrase'>", "").replace("<span>", " ").replace("<span>", " ").replace("<b>","").replace("</b>", "");snippet=snippet.replace("</B>","").replace("<B>", "").replace("<br>","").replace("</br>", "").replace("...", ". ").replace("|","").replace(">", " ").replace(". .", ".
");searchResultText = hit.getTitle() + " " +snippet;}if (flag != -1) {//newHitList.add(hit);if (count <= (int) (hits.size() * lower_threshold)|| count >= (int) (hits.size() *upper_threshold)) {treeBankBuffer.addAll(formTreeKernelStructure(229searchResultText,count,hits));}//middle 10 are used for classificationif (count > (int) (hits.size() * lower_threshold)&& count < (int) (hits.size() *upper_threshold)&& rnd.nextFloat() <= ((float) (1) upper_threshold + lower_threshold)*ratio/(upper_threshold - lower_threshold)) {treeBankClassifyBuffer.addAll(formTreeKernelClassifyStructure(searchResultText));newHitList.add(hit);}}}}// write the lits of samples to a fileProfileReaderWriter.writeReport(treeBankBuffer,path+trainingFileName, ' ');//add examples to log/*String[] arrQuery = new String[1];arrQuery[0] = "Query = " + query;treeBankBuffer.add(0, arrQuery);treeBankBuffer.add(new String[] {" "});ProfileReaderWriter.appendReport(treeBankBuffer,detailedLearningOutput);// build the modeltkRunner.runLearner(path, trainingFileName, modelFileName);*/// now we preparing the same answers to be classifies in/out/*treeBankBuffer = new ArrayList<String[]>();for (HitBase hit : newHitList) {// not original docs now but instead a snippet230String searchResultTextAbstr = hit.getAbstractText();Stringsnippet=searchResultTextAbstr.replace("<b>...</b>", ".
").replace("<span class='best-phrase'>"," ").replace("<span>", " ").replace("<span>", " ").replace("<b>", "").replace("</b>", "");snippet = snippet.replace("</B>", "").replace("<B>", "").replace("<br>","").replace("</br>","").replace("...", ". ").replace("|", " ").replace(">", " ").replace("..", ".
");snippet = hit.getTitle() + " " + snippet;ParseThicketmatcher.buildParseThicketFromTextWithRST(snippet);pt=//hit.getPageContent());List<Tree> forest = pt.getSentences();// we consider the snippet as a single sentence to beclassifiedif (forest.size()>0){treeBankBuffer.add(newString[]{"0|BT|"+forest.get(0).toString()+ " |ET|"});newHitListReRanked .add(hit);}}*/ProfileReaderWriter.writeReport(treeBankClassifyBuffer,path+unknownToBeClassified, ' ');tkRunner.runClassifier(path,modelFileName, classifierOutput);unknownToBeClassified,// read classification resultsList<String[]>classifResultsProfileReaderWriter.readProfiles(path+classifierOutput, ' ');=HitBase h;// iterate through classification results and set them as scores forhits//newHitList = new ArrayList<HitBase>();231for(int i=0; i < newHitList.size() && i < classifResults.size() ;i++){String scoreClassif = classifResults.get(i)[0];float val = Float.parseFloat(scoreClassif);h = newHitList.get(i);h.setGenerWithQueryScore(((double) val));newHitList.set(i, h);}// sort by SVM classification resultsCollections.sort(newHitList, new HitBaseComparable());System.out.println("\n\n================= ");=============NEWORDERfor (HitBase hit : newHitList) {if (!(onlySnippets)){System.out.println(hit.getOriginalSentences().toString()+ " => "+hit.getGenerWithQueryScore());System.out.println("pagecontent="+hit.getPageContent());}System.out.println("title = "+hit.getAbstractText());System.out.println("snippet = "+hit.getAbstractText());System.out.println("match = "+hit.getSource());}return newHitList;}protectedList<String[]>searchResultText, int count, List<HitBase> hits) {formTreeKernelStructure(StringList<String[]> treeBankBuffer = new ArrayList<String[]> ();try {// if from the top of ranked docs, then positive, if from thebottom - negativeString posOrNeg = null;232if (count<=(int) (hits.size() * lower_threshold))posOrNeg=" 1 ";else if (count >= (int) (hits.size() * upper_threshold))posOrNeg=" -1 ";elseposOrNeg=" 0 ";//middle for 
classification// form the list of training samplesif (posOrNeg != " 0 "){// get the parses from original documents, andform the training datasetParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);=List<Tree> forest = pt.getSentences();String[] sent = new String[1];sent[0] = posOrNeg;for(Tree t: forest){//treeBankBuffer.add(newString[]{posOrNeg+" |BT| "+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}if (sent[0] == posOrNeg){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);}} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return treeBankBuffer;}233protectedsearchResultText) {List<String[]>formTreeKernelClassifyStructure(StringList<String[]> treeBankBuffer = new ArrayList<String[]> ();try {ParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);=List<Tree> forest = pt.getSentences();String[] sent = new String[1];sent[0] = " 0 ";for(Tree t: forest){//treeBankBuffer.add(new String[] {" 0 " + |BT|"+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}if (sent[0] == " 0 "){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return treeBankBuffer;}Обучение и классификация на лесе расширенных деревьев.Пакетkernel_interface,файлMultiSentenceKernelBasedExtendedForestSearchResultsProcessor(данный класс наследует класс, описанный в предыдущем файле).public class MultiSentenceKernelBasedExtendedForestSearchResultsProcessorextends MultiSentenceKernelBasedSearchResultsProcessor{private static Logger LOG = Logger234.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor");protected TreeExtenderByAnotherLinkedTree treeExtender = newTreeExtenderByAnotherLinkedTree();protectedList<String[]>searchResultText, int count, List<HitBase> hits) {formTreeKernelStructure(StringList<String[]> treeBankBuffer = new 
ArrayList<String[]> ();try {// if from the top of ranked docs then positive, if from thebottom - negativeString posOrNeg = null;if (count<=(int) (hits.size() * lower_threshold))posOrNeg=" 1 ";else if (count >= (int) (hits.size() * upper_threshold))posOrNeg=" -1 ";elseposOrNeg=" 0 ";//middle for classification// form the list of training samplesif (posOrNeg != " 0 "){// get the parses from original documents, andform the training datasetParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);=List<String>treeExtender.buildForestForCorefArcs(pt);=extendedTreesDumpString[] sent = new String[1];sent[0] = posOrNeg;List<Tree> forest = pt.getSentences();for(Tree t: forest){//treeBankBuffer.add(new String[] {" 0 " +|BT| "+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}//adding trees with semantic arcsfor(String t: extendedTreesDump){235//treeBankBuffer.add(newString[]{posOrNeg+" |BT| "+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t;}if (sent[0] == posOrNeg){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);}} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return treeBankBuffer;}protectedsearchResultText) {List<String[]>formTreeKernelClassifyStructure(StringList<String[]> treeBankBuffer = new ArrayList<String[]> ();try {ParseThicketptmatcher.buildParseThicketFromTextWithRST(searchResultText);=List<String>treeExtender.buildForestForCorefArcs(pt);=extendedTreesDumpString[] sent = new String[1];sent[0] = " 0 ";List<Tree> forest = pt.getSentences();for(Tree t: forest){//treeBankBuffer.add(new String[] {" 0 " + |BT|"+t.toString()+ " |ET|"});sent[0] = sent[0] + " |BT| " + t.toString();}//adding trees with semantic arcs236for(String t: extendedTreesDump){sent[0] = sent[0] + " |BT| " + t;}if (sent[0] == " 0 "){sent[0] += "|BT| |ET|";}else {sent[0] += " |ET|";}treeBankBuffer.add(sent);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return 
treeBankBuffer;}Подготовка запросов на основе YahooAnswers.
Пакет apps, файл YahooAnswersMiner.

/**
 * Query runner that restricts every search to {@code site:answers.yahoo.com}.
 *
 * <p>The instance keeps a page counter that advances by one after every single-page request,
 * so consecutive calls walk through consecutive result pages. The counter is never reset here.
 * {@code aq} and {@code BING_KEY} are not declared in this class; presumably they are
 * inherited from {@link BingQueryRunner}.
 */
public class YahooAnswersMiner extends BingQueryRunner {

    private static final Logger LOG = Logger
            .getLogger("opennlp.tools.similarity.apps.YahooAnswersMiner");

    /** Index of the next result page to request. */
    private int page = 0;

    /** Number of hits requested per page. */
    private static final int hitsPerPage = 50;

    /**
     * Requests the current result page for {@code query} and advances the page counter.
     *
     * @param query search terms; {@code "site:answers.yahoo.com "} is prepended
     * @return one {@link HitBase} per returned web result, carrying its description (as
     *         abstract text), title and URL
     */
    public List<HitBase> runSearch(String query) {
        aq.setAppid(BING_KEY);
        aq.setQuery("site:answers.yahoo.com " + query);
        aq.setPerPage(hitsPerPage);
        aq.setPage(page);
        aq.doQuery();

        List<HitBase> results = new ArrayList<HitBase>();
        AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
        for (AzureSearchWebResult anr : ars) {
            HitBase h = new HitBase();
            h.setAbstractText(anr.getDescription());
            h.setTitle(anr.getTitle());
            h.setUrl(anr.getUrl());
            results.add(h);
        }
        page++;
        return results;
    }

    /**
     * Requests successive pages for {@code query} and concatenates their hits.
     *
     * <p>Paging continues while {@code totalPages > page * hitsPerPage}. Despite its name, the
     * parameter is therefore compared against a number of hits, not a number of pages. At most
     * 12 requests are issued per call, because the loop stops once the request counter exceeds
     * 10.
     *
     * @param query      search terms, passed to {@link #runSearch(String)}
     * @param totalPages upper bound compared against {@code page * hitsPerPage}
     * @return the hits of all pages fetched by this call, in request order
     */
    public List<HitBase> runSearch(String query, int totalPages) {
        int count = 0;
        List<HitBase> results = new ArrayList<HitBase>();
        while (totalPages > page * hitsPerPage) {
            List<HitBase> res = runSearch(query);
            results.addAll(res);
            // hard cap on the number of requests issued by one call
            if (count > 10) {
                break;
            }
            count++;
        }
        return results;
    }
}

Приложение 7
В данном приложении приведены ключевые фрагменты кода (на языке Python), использовавшегося для реализации метода выявления тождественных денотатов и проведения экспериментов на прикладной онтологии и сгенерированных формальных контекстах.
Генерация данных.