Диссертация (1137241), страница 25
Текст из файла (страница 25)
Пакет crawling, файлprepareQueries.public class PrepareQueries {privatestatic"Queries\\Queries900set.csv";finalStringinitialQueryFileName=private static final String queryFileName = "Queries\\queries_final.csv";privatestaticfinal"Queries\\multi_sentence_queries.csv";StringgoodQueryFileName=privatestaticfinal"Queries\\queries_short_suggestion.csv";StringshortQueryFileName=suggestQueryFileName=privatestatic"Queries\\examples2013.csv";finalStringprivatestaticfinalStringpath"D:\\Documents\\HSE\\Thickets\\Java\\Working\\Kernels_for_search\\";=210public static void ProcessQueries(){//getting product names from fileList<String[]>initialQueriesProfileReaderWriter.readProfiles(path + initialQueryFileName, ',');=//retrieving product descriptions, making them originalBingAPIProductSearchManagerBingAPIProductSearchManager ();man=new//SentenceOriginalizer orig = new SentenceOriginalizer(newString[]{"123"});SentenceOriginalizerSentenceOriginalizer("src/test/resources");orig=newList<String> query = new ArrayList<String>();String finalQuery = "";String[] arrayQuery = {};List<String[]> results = new ArrayList<String[]>();List<HitBase> res;SnippetToParagraphFull snpf = new SnippetToParagraphFull();int count = 0;int ct;for (String[] s : initialQueries){res"amazon.com","opinion", 50);=man.findProductByName(s[0],finalQuery = "";for (HitBase h : res){hsnpf.formTextFromOriginalPageGivenSnippet(h);if (!(h.getOriginalSentences().isEmpty())){query = h.getOriginalSentences();if (query.toString().length() >= 150){=211arrayQuery=newString[query.size()];ct = 0;for (String sent : query){arrayQuery[ct++] = sent;}//originalizerorig.convert(arrayQuery,s[0],s[0]);for (String sent : arrayQuery){finalQuery += sent;}results.add(arrayQuery);}}//end if}//end forif (finalQuery == ""){//query.add(s[0]);finalQuery = s[0];arrayQuery = new String[1];arrayQuery[0] = s[0];results.add(arrayQuery);}//System.out.println(query);//System.out.println("");if (count++ > 25) break;}//ProfileReaderWriter.writeReport(initialQueries,queryFileName, ',');path+212ProfileReaderWriter.writeReport(results, path + queryFileName);}public static void cleanQueries(){List<String[]>allQueriesProfileReaderWriter.readProfiles(path + queryFileName, ',');=List<String[]> goodQueries = new ArrayList<String[]>();int ct = 0;for (String[] sent : allQueries){ct = 0;for (String s : sent){//s.replace(" ", " ");//if(s.trim().matches("[\\s\\W]*Find[\\s\\W]+helpful[\\s\\W]+customer.*")){s.trim().contains("helpful")s.trim().contains("reviews") ){if&&(s.trim().contains("Find")s.trim().contains("customer")&&&&ct = 1;break;}if(s.trim().contains("Find")s.trim().contains("biggest") && s.trim().contains("selection") ){&&ct = 1;break;}}if (ct != 1) {goodQueries.add(sent);}}ProfileReaderWriter.writeReport(goodQueries,goodQueryFileName);path+}public static void makeShortQueries(){List<String[]>goodQueriesProfileReaderWriter.readProfiles(path + goodQueryFileName, ',');=213List<String[]> shortQueries = new ArrayList<String[]>();int ct = 0;String[] res = new String[1];String query = "";List<String> relSentences;for (String[] sent : goodQueries){relSentences = new ArrayList<String> ();res = new String[1];res[0] = "";query = "";for (String s : sent){try {relSentencesRelatedSentenceFinder.buildSearchEngineQueryFromSentence(s);=for (String phrase : relSentences){if (phrase.trim().startsWith("+")){res[0] += phrase;} else {query += phrase;}}} catch (Exception e) {e.printStackTrace();query += s;}}if (res[0] != ""){res[0] = String.valueOf(ct) + "!!!" + query + "!!!" +res[0];ct++;shortQueries.add(res);}}ProfileReaderWriter.writeReport(shortQueries,shortQueryFileName);path+214}public static void ProcessSuggestionQueries(){List<String[]>suggestQueriesProfileReaderWriter.readProfiles(path + suggestQueryFileName, ';');=List<String[]> goodQueries = new ArrayList<String[]>();suggestQueries.remove(0);int ct = 0;for (String[] sent : suggestQueries){String res[] = new String[1];res[0] = sent[0].trim();res[0] = res[0].replace("\"", "");res[0] = res[0].replaceFirst("[0-9]+,", "");res[0] = res[0].replaceFirst("[^,]+,", "");ct = res[0].indexOf(",http");res[0] = res[0].substring(0, ct);goodQueries.add(res);}ProfileReaderWriter.writeReport(goodQueries,goodQueryFileName);path+}Обучение и классификация на лесе регулярных деревьевразбора.Пакетkernel_interface,файлMultiSentenceKernelBasedSearchResultsProcessor.public class MultiSentenceKernelBasedSearchResultsProcessorMultiSentenceSearchResultsProcessor{extendsprivate static Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedSearchResultsProcessor");privateWebSearchEngineResultsScraperWebSearchEngineResultsScraper();scraper=newprotected Matcher matcher = new Matcher();private ParseTreeChunkListScorer parseTreeChunkListScorer = newParseTreeChunkListScorer();215protected BingQueryRunnerMultipageSearchResults bingSearcher = newBingQueryRunnerMultipageSearchResults();private SnippetToParagraph snp = new SnippetToParagraph();private TreeKernelRunner tkRunner = new TreeKernelRunner();protected final float lower_threshold = (float) 0.2;protected final float upper_threshold = (float) 0.8;protected final float ratio = (float) 0.4; //соотношение обучающей итестовой выборкиprivate String path;public void setKernelPath (String path){this.path=path;}protected static final String modelFileName = "model.txt";protected static final String trainingFileName = "training.txt";protected static final String unknownToBeClassified = "unknown.txt";protected static final String classifierOutput = "classifier_output.txt";protectedstatic"\\Answers\\answers_test.csv";protectedstatic"\\Answers\\answers_learn.csv";finalfinalStringStringdetailedOutputdetailedLearningOutput==public List<HitBase> runSearchViaAPI(String query) {List<HitBase> hits = null;List<String[]> output = new ArrayList<String[]>();String[] sent;String[] fullQuery = query.split("!!!");try {List<HitBase>bingSearcher.runSearch(fullQuery[2], 100);//100resultList=216// now we apply our own relevance filter//hits = calculateMatchScoreResortHits(resultList, query);hits = resultList;//once we applied our re-ranking, we set highly ranked aspositive set, low-rated as negative set//and classify search results from the middle//training set is formed from original documents for thesearch results,// and 10 of these search results from the middle areclassified//true for snippetshits = filterOutIrrelevantHitsByTreeKernelLearning(hits,fullQuery[2], false);//true for snippets//copying results to the List<String[]>sent = new String[2];sent[1] = fullQuery[0] + " " + fullQuery[1];output.add(new String[] {""});output.add(new String[] {""});output.add(sent);sent = new String[2];sent[1] = fullQuery[2];output.add(sent);output.add(new String[] {""});for(HitBase h : hits){sent = new String[2];sent[0]h.getGenerWithQueryScore().toString() + " ";=""+output.add(sent);sent = new String[1];sent[0] = "page content = " + h.getPageContent();output.add(sent);217sent = new String[1];sent[0]="origsent="+h.getOriginalSentences().toString();output.add(sent);sent = new String[1];sent[0] = "title = " + h.getTitle();output.add(sent);sent = new String[1];sent[0] = "abstract = " + h.getAbstractText();output.add(sent);output.add(new String[] {""});}//appending results to the reportProfileReaderWriter.appendReport(output,path+detailedOutput);} catch (Exception e) {e.printStackTrace();LOG.info("No search results for query '" + fullQuery[2]);return null;}return hits;}private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning(List<HitBase> hits, String query, Boolean onlySnippets){List<HitBase> newHitList = new ArrayList<HitBase>();List<HitBase> newHitListTraining = new ArrayList<HitBase>();// form the training set from original documents.
Since searchresults are ranked, we set the top-20 as positive set,//and the bottom-20 as negative set.// after re-classification, being re-ranked, the search results mightend up in a different setList<String[]> treeBankBuffer = new ArrayList<String[]>();218List<String[]>ArrayList<String[]>();treeBankClassifyBuffer=newString snippet;Random rnd = new Random();int count = 0;int flag = 0;for (HitBase hit : hits) {count++;flag = 0;// if orig content has been already set in HIT object, ok;otherwise set itif(!(hit.getUrl().matches(".*\\.(doc[xm]|doc|DOC[XM]|DOC|jar|JAR|XLS|xls)$"))){//if (hit.getUrl().contains("ATTACHMENT01")){String searchResultText;if (!(onlySnippets)) {searchResultText = hit.getPageContent();if (searchResultText == null) {try {String[]pageSentsAndSnippet = formTextForReRankingFromHit(hit);searchResultText=pageSentsAndSnippet[0];hit.setPageContent(searchResultText);} catch (Exception e) {// skip if we are not able tofetch page contente.printStackTrace();flag = -1;} catch (NoClassDefFoundErrore1) {e1.printStackTrace();flag = -1;} catch (NoSuchMethodError e1) {// skip if we are not able tofetch page content219e1.printStackTrace();flag = -1;}}} else {//getting snippetssearchResultText = hit.getAbstractText();snippet=searchResultText.replace("<b>...</b>", ".