диссертация (1169444), страница 20
Текст из файла (страница 20)
"""Sociolinguistic analyzer of the British National Corpus (BNC).

For every BNC text whose metadata passes the selectors configured below, the
script walks the utterances (``<u>``) of speakers with a selected dialect,
finds the words (``<w>``) whose ``c5`` tag is selected, and writes one row per
hit (left context, word, right context, speaker and source metadata) to an
``.xls`` workbook, on the sheet named after the speaker's ``soc`` attribute.

NOTE(review): this listing was reconstructed from a flattened text extraction
(lost indentation, fused keywords, embedded page numbers).  The nesting below
is inferred from the code's syntax; confirm it against the original program.
"""

import xml.etree.ElementTree as ET

# ------------------------------------------------------------------ settings

# Root directory of the BNC distribution.  Paths are built by plain string
# concatenation, so the value must end with a path separator.
BNCpath = 'D:/BNCcorp/'

# Output file name without extension ('.xls' is appended when saving).
output_file_name = 'name'

# Dialect selector: quoted dialect codes separated by commas.
# NOTE(review): the listing shows the placeholder ``(dialect_types)`` here;
# fill in the real codes before running.
dialect_selector = ()

# Word type selector: quoted ``c5`` tags separated by commas.
# NOTE(review): the listing shows the placeholder ``(word_types)`` here;
# fill in the real tags before running.
word_type_selector = ()

# File ids are set manually from the BNC reference file.
# The identifier 'all' removes the restriction for the given selector.
file_id = 'all'
Medium = 'all'
Domain = 'all'
GENRE = 'all'
Aud_Age = 'all'
Aud_Sex = 'all'
Aud_Level = 'all'
Sampling = 'all'
Circulation_Status = 'all'
Interaction_Type = 'all'
Time_Period = 'all'
Mode = 'all'
Author_Age = 'all'
Author_Sex = 'all'
Author_Type = 'all'

# ----------------------------------------------------------- processing code

# Excel column (in the BNC source index workbook) that holds each metadata
# field used for filtering.
field_names = {
    'Medium': 'B', 'Domain': 'C', 'GENRE': 'D', 'Aud_Age': 'H',
    'Aud_Sex': 'I', 'Aud_Level': 'J', 'Sampling': 'L',
    'Circulation_Status': 'N', 'Interaction_Type': 'O', 'Time_Period': 'P',
    'Mode': 'Q', 'Author_Age': 'R', 'Author_Sex': 'S', 'Author_Type': 'T',
}

# Every value a metadata field can take; substituted for the 'all' wildcard.
_ALL_VALUES = {
    'Medium': (
        'm_pub', 'periodical', 'book', 'm_unpub', '---', 'to_be_spoken',
    ),
    'Domain': (
        'W_soc_science', 'W_world_affairs', 'W_arts', 'W_belief_thought',
        'W_imaginative', 'W_leisure', 'W_nat_science', 'W_commerce',
        'W_app_science', 'S_cg_public_instit', 'S_cg_business',
        'S_cg_education', 'S_cg_leisure', '---', 'S_Demog_AB', 'S_Demog_DE',
        'S_Demog_C2', 'S_Demog_C1', 'S_Demog_Unclassified',
    ),
    'GENRE': (
        'W_non_ac_medicine', 'W_institut_doc', 'W_pop_lore',
        'W_ac_humanities_arts', 'W_non_ac_humanities_arts', 'W_fict_prose',
        'W_misc', 'W_ac_soc_science', 'W_biography', 'W_non_ac_soc_science',
        'W_instructional', 'W_non_ac_tech_engin', 'W_newsp_brdsht_nat_arts',
        'W_newsp_brdsht_nat_commerce', 'W_newsp_brdsht_nat_editorial',
        'W_newsp_brdsht_nat_report', 'W_newsp_brdsht_nat_misc',
        'W_newsp_brdsht_nat_social', 'W_newsp_brdsht_nat_science',
        'W_newsp_brdsht_nat_sports', 'W_ac_polit_law_edu', 'W_commerce',
        'W_non_ac_polit_law_edu', 'W_non_ac_nat_science', 'W_religion',
        'W_advert', 'W_ac_nat_science', 'W_letters_prof',
        'W_newsp_other_report', 'W_ac_medicine', 'W_fict_poetry',
        'W_ac_tech_engin', 'W_newsp_other_social', 'W_newsp_other_commerce',
        'W_newsp_other_sports', 'W_newsp_tabloid', 'S_speech_scripted',
        'S_pub_debate', 'S_meeting', 'S_speech_unscripted', 'W_admin',
        'S_classroom', 'S_unclassified', 'S_demonstratn', 'S_courtroom',
        'S_interview_oral_history', 'S_lect_soc_science',
        'S_lect_nat_science', 'S_tutorial', 'S_lect_humanities_arts',
        'S_brdcast_discussn', 'S_sermon', 'S_consult', 'W_fict_drama',
        'W_hansard', 'S_interview', 'W_letters_personal', 'W_essay_univ',
        'W_essay_school', '---', 'S_brdcast_documentary', 'S_sportslive',
        'S_brdcast_news', 'S_lect_polit_law_edu', 'S_lect_commerce',
        'W_email', 'W_news_script', 'S_parliament', 'W_newsp_other_science',
        'W_newsp_other_arts', 'S_conv',
    ),
    'Aud_Age': ('adult', 'teen', 'child', '---'),
    'Aud_Sex': ('mixed', 'male', 'female', '---'),
    'Aud_Level': ('med', 'low', 'high', '---'),
    'Sampling': ('cmp', 'whl', 'beg', 'mid', '--', 'end'),
    'Circulation_Status': ('M', 'L', 'H', '-'),
    'Interaction_Type': ('---', 'Dialogue', 'Monologue'),
    'Time_Period': ('1985-1994', '---', '1975-1984', '1960-1974'),
    'Mode': ('W', 'S'),
    'Author_Age': (
        '---', '45-59', '15-24', '25-34', '35-44', '60+ yrs', '0-14',
    ),
    'Author_Sex': ('---', 'Mixed', 'Male', 'Female', 'Unknown'),
    'Author_Type': ('Multiple', 'Corporate', 'Sole', 'Unknown', '---'),
}

# Sheets of the output workbook: one per value of the speaker's ``soc``
# attribute that is exported.
_SOCIAL_CLASSES = ('UU', 'DE', 'C1', 'AB', 'C2')

# Fully qualified name of the ``xml:id`` attribute used to key <person> nodes.
_XML_ID = '{http://www.w3.org/XML/1998/namespace}id'

# Header row written to every data sheet; column order matches the data rows.
_HEADER = [
    'left Context', 'Abstr Noun', 'right Context', 'c5', 'pos', 'sex',
    'role', 'social class', 'dialect', 'occupation', 'age', 'name',
    'dialect (code)', 'file name', 'GENRE', 'Notes & Alternative Genres',
    'Interaction Type', 'Time Period (Alltim)', 'sentencenum', 'Domain',
    'Word Total (in source)', 'source title',
]

# Sheet of the BNC source index workbook read by GetItem(); set by main().
sheet = None


def _is_all(value):
    """Return True when *value* is the 'all' wildcard (string or 1-tuple)."""
    if isinstance(value, str):
        return value == 'all'
    return tuple(value) == ('all',)


def _as_tuple(value):
    """Return selector *value* as a tuple.

    ``('x')`` is just the string ``'x'`` in Python, and ``in`` on a string is
    a substring test (so an empty metadata value would match anything).
    Wrapping bare strings makes every membership test an exact match.
    """
    if isinstance(value, str):
        return (value,)
    return tuple(value)


def _expand_selector(value, all_values):
    """Return *value* as a tuple, replacing the 'all' wildcard by *all_values*."""
    if _is_all(value):
        return tuple(all_values)
    return _as_tuple(value)


# Replace the 'all' wildcard by the possible values of each field.
Medium = _expand_selector(Medium, _ALL_VALUES['Medium'])
Domain = _expand_selector(Domain, _ALL_VALUES['Domain'])
GENRE = _expand_selector(GENRE, _ALL_VALUES['GENRE'])
Aud_Age = _expand_selector(Aud_Age, _ALL_VALUES['Aud_Age'])
Aud_Sex = _expand_selector(Aud_Sex, _ALL_VALUES['Aud_Sex'])
Aud_Level = _expand_selector(Aud_Level, _ALL_VALUES['Aud_Level'])
Sampling = _expand_selector(Sampling, _ALL_VALUES['Sampling'])
Circulation_Status = _expand_selector(
    Circulation_Status, _ALL_VALUES['Circulation_Status'])
Interaction_Type = _expand_selector(
    Interaction_Type, _ALL_VALUES['Interaction_Type'])
Time_Period = _expand_selector(Time_Period, _ALL_VALUES['Time_Period'])
Mode = _expand_selector(Mode, _ALL_VALUES['Mode'])
Author_Age = _expand_selector(Author_Age, _ALL_VALUES['Author_Age'])
Author_Sex = _expand_selector(Author_Sex, _ALL_VALUES['Author_Sex'])
Author_Type = _expand_selector(Author_Type, _ALL_VALUES['Author_Type'])


def colindex(colname):
    """Convert an Excel column name to a zero-based column index.

    ``'A'`` -> 0, ``'Z'`` -> 25, ``'AA'`` -> 26, ``'AAA'`` -> 702.  The name
    is case-insensitive.

    Raises:
        ValueError: if *colname* is empty or contains a non-letter.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if not colname:
        raise ValueError('empty column name')
    k = 0
    for ch in colname.upper():
        digit = alphabet.find(ch)
        if digit < 0:
            raise ValueError('invalid column name: ' + repr(colname))
        # Bijective base-26: each letter is a digit in 1..26.
        k = k * 26 + digit + 1
    return k - 1


def GetItem(stri, intg):
    """Return the source-index cell addressed in Excel notation.

    Args:
        stri: column name as shown by Excel (e.g. ``'D'``).
        intg: one-based row number as shown by Excel.
    """
    return sheet.cell(intg - 1, colindex(stri)).value


def GetTextFind(host_el, key, akey):
    """Return the text of child node *akey* of ``host_el[key]``.

    Returns ``''`` when *key* is missing or the child node does not exist.
    """
    try:
        return host_el[key].find(akey).text
    except KeyError:
        return ''
    except AttributeError:
        return ''


def GetAttr(host_el, key, akey):
    """Return attribute *akey* of ``host_el[key]``.

    Returns ``''`` when *key* or the attribute is missing.
    """
    try:
        return host_el[key].attrib[akey]
    except KeyError:
        return ''


def writeRow(sheet, row, vec):
    """Write the items of *vec* into consecutive cells of *row*, from column 0."""
    for col, value in enumerate(vec):
        sheet.write(row, col, value)


def integerate(s):
    """Return ``int(s)`` when *s* converts to an integer, otherwise *s* itself.

    ``None`` (e.g. the text of an empty XML node) is returned unchanged
    instead of raising ``TypeError``.
    """
    try:
        return int(s)
    except (TypeError, ValueError):
        return s


# Leftover of a debug output line that was commented out in the listing; it
# named the fields: PersonDict[who].find('age').text ;
# PersonDict[who].find('persName').text ;
# PersonDict[who].find('occupation').text ;
# PersonDict[who].find('dialect').text ; f ; title


def _collect_contexts(sentence, word_types):
    """Find the selected words of one ``<s>`` element with their contexts.

    The text of every descendant ``<w>`` and ``<c>`` element is taken in
    document order; other elements contribute no text.

    Returns:
        A list of ``(left, word, right, c5, pos)`` tuples, one per ``<w>``
        whose ``c5`` attribute is in *word_types*.  ``left`` / ``right`` are
        the concatenated texts before / after the word within the sentence.
    """
    texts = []
    hits = []  # (position in texts, matching <w> element)
    for tok in sentence.findall('.//*'):
        text = ''
        if tok.tag in ('c', 'w'):
            text = tok.text or ''
        if tok.tag == 'w' and tok.attrib['c5'] in word_types:
            hits.append((len(texts), tok))
        texts.append(text)
    return [
        (''.join(texts[:at]), tok.text, ''.join(texts[at + 1:]),
         tok.attrib['c5'], tok.attrib['pos'])
        for at, tok in hits
    ]


def main():
    """Filter the corpus by the configured selectors and write the workbook."""
    global sheet

    # Third-party Excel libraries are imported here so that the helper
    # functions above stay importable without them.
    import xlrd
    import xlwt

    dialects = _as_tuple(dialect_selector)
    word_types = _as_tuple(word_type_selector)
    selectors = {
        'Medium': Medium, 'Domain': Domain, 'GENRE': GENRE,
        'Aud_Age': Aud_Age, 'Aud_Sex': Aud_Sex, 'Aud_Level': Aud_Level,
        'Sampling': Sampling, 'Circulation_Status': Circulation_Status,
        'Interaction_Type': Interaction_Type, 'Time_Period': Time_Period,
        'Mode': Mode, 'Author_Age': Author_Age, 'Author_Sex': Author_Sex,
        'Author_Type': Author_Type,
    }

    # Build the file index: file id -> path relative to 'Texts/'.
    # Characters 5..7 of each indexed path are used as the file id
    # (presumably paths look like 'A/A0/A00.xml' - TODO confirm).
    index_root = ET.parse(BNCpath + 'Etc/file_index.xml').getroot()
    tmp_index = {ir.text[5:8]: ir.text for ir in index_root.iter('file')}
    if _is_all(file_id):
        index = tmp_index
    else:
        wanted = _as_tuple(file_id)
        index = {fid: tmp_index[fid] for fid in tmp_index if fid in wanted}

    # Read the source description workbook.
    rb = xlrd.open_workbook(BNCpath + 'BNC_WORLD_INDEX.XLS',
                            formatting_info=True)
    sheet = rb.sheet_by_index(0)

    # Map the first-column value of each row to its one-based row number.
    # Non-ASCII characters are dropped as in the original; decoding back
    # keeps the keys comparable with the ``str`` file ids.
    SourceRowIndex = {}
    for rownum in range(sheet.nrows):
        key = sheet.cell(rownum, 0).value.encode('ascii', 'ignore')
        SourceRowIndex[key.decode('ascii')] = rownum + 1

    # Create the output workbook: one sheet per social class plus 'meta'.
    wb = xlwt.Workbook()
    class_sheets = {}
    for code in _SOCIAL_CLASSES:
        class_sheets[code] = wb.add_sheet(code)
        writeRow(class_sheets[code], 0, _HEADER)
    meta = wb.add_sheet('meta')

    progress = 0
    num = 0  # number of hits found (including hits not written to any sheet)
    RowNumIndex = dict.fromkeys(_SOCIAL_CLASSES, 1)  # next free row per sheet

    # Check every file against the configured characteristics.
    for f in index:
        progress = progress + 1
        RowNum = SourceRowIndex[f]
        print(str(len(index) - progress) + '-->' + str(RowNum) + ' |=| '
              + str(f) + ' ЗАПИСЕЙ: ' + str(num))

        # Selection conditions: every configured field must match.
        # NOTE(review): the listing tested a bare ``Medium`` (always true)
        # and a 'Bibliographical_Details' field that is defined nowhere;
        # Medium is now really filtered and the undefined field is dropped.
        if not all(GetItem(col, RowNum) in selectors[name]
                   for name, col in field_names.items()):
            continue

        # A matching file was found: parse it and take it apart.
        tree = ET.parse(BNCpath + 'Texts/' + index[f])
        root = tree.getroot()
        title = tree.find('.//title').text

        # Speakers of this file, keyed by their xml:id.
        PersonDict = {pers.attrib[_XML_ID]: pers
                      for pers in tree.findall('.//person')}

        # Source metadata is the same for every hit of this file.
        genre = GetItem('D', RowNum)
        notes = GetItem('E', RowNum)
        interaction = GetItem('O', RowNum)
        period = GetItem('P', RowNum)
        domain = GetItem('C', RowNum)
        word_total = integerate(GetItem('K', RowNum))

        for u in root.findall('.//u'):
            who = u.attrib['who']
            if GetAttr(PersonDict, who, 'dialect') not in dialects:
                continue
            soc = GetAttr(PersonDict, who, 'soc')
            speaker = [
                GetAttr(PersonDict, who, 'sex'),
                GetAttr(PersonDict, who, 'role'),
                soc,
                GetTextFind(PersonDict, who, 'dialect'),
                GetTextFind(PersonDict, who, 'occupation'),
                integerate(GetTextFind(PersonDict, who, 'age')),
                GetTextFind(PersonDict, who, 'persName'),
                GetAttr(PersonDict, who, 'dialect'),
            ]
            for s in u.findall('s'):
                n = s.attrib['n']
                for left, word, right, c5, pos in _collect_contexts(
                        s, word_types):
                    num = num + 1
                    vec = [left, word, right, c5, pos] + speaker + [
                        f, genre, notes, interaction, period,
                        integerate(n), domain, word_total, title,
                    ]
                    # Speakers of any other social class are counted in
                    # ``num`` but not exported.
                    if soc in class_sheets:
                        writeRow(class_sheets[soc], RowNumIndex[soc], vec)
                        RowNumIndex[soc] = RowNumIndex[soc] + 1

    # Summary sheet: total number of hits, then the row count of each sheet.
    meta.write(0, 0, 'records')
    meta.write(0, 1, num)
    for i, key in enumerate(_SOCIAL_CLASSES, 1):
        meta.write(i, 0, key)
        meta.write(i, 1, RowNumIndex[key] - 1)

    wb.save(BNCpath + output_file_name + '.xls')
    print(num)


if __name__ == '__main__':
    main()

# End of the program listing.  The source document continues with its list
# of references ("СПИСОК ИСПОЛЬЗОВАННОЙ ЛИТЕРАТУРЫ"), item 1.