Диссертация (1137241), страница 27
Текст из файла (страница 27)
# File: create_context.py
# Generates a random formal context ("object;attribute" lines) with planted
# near-duplicate objects for the duplicate-detection experiments.
import random as r
import math


def create_context(path, n=0, m=0, dubls=0):
    """Generate a random context file at `path`.

    path  -- output file; each line is '<object>;<attribute>'.
    n     -- number of original objects (random 1000..10000 when 0).
    m     -- attribute-universe size (random 1000..20000 when 0).
    dubls -- number of planted duplicates (random 1..1000 when 0).

    Originals are named 'X<i>'; a duplicate of original i is named
    'Y<k>X<i>', so evaluation code can recover the source object id
    from the name.
    """
    if n == 0:
        n = r.randint(1000, 10000)
    if m == 0:
        m = r.randint(1000, 20000)
    if dubls == 0:
        dubls = r.randint(1, 1000)
    print('|G| - |D| = ' + str(n))
    print('|M| = ' + str(m))
    print('|D| = ' + str(dubls))
    table = []
    rows = []
    for i in range(n):
        # Roughly exponentially distributed attribute count, capped by m.
        attrs_num = min(int(-1 * math.log(0.1 * r.random()) + int(r.random())), m)
        attrs = []
        while len(attrs) < attrs_num:
            # Cubing skews draws toward small attribute ids, so some
            # attributes occur much more frequently than others.
            attr = round(r.random() ** 3 * m)
            if attr not in attrs:
                attrs.append(attr)
        table.append(attrs)
        for attr in attrs:
            rows.append('X' + str(i) + ';M' + str(attr) + '\n')
    for i in range(dubls):
        # BUGFIX: was r.randint(0, n), whose inclusive upper bound n either
        # indexes past the n originals (IndexError on the first iteration)
        # or, after table grows below, silently picks another duplicate
        # while the 'Y<i>X<dub>' label still claims an original source.
        # Duplicates must be drawn from the n originals only.
        dub = r.randrange(n)
        attrs = table[dub]
        nattrs = []
        for attr in attrs:
            if r.random() < 0.8:   # keep ~80% of the source's attributes
                nattrs.append(attr)
        while r.random() < 0.4:    # add a geometric number of noise attributes
            attr = r.randint(0, m)
            if attr not in nattrs:
                nattrs.append(attr)
        table.append(nattrs)
        for attr in nattrs:
            rows.append('Y' + str(i) + 'X' + str(dub) + ';M' + str(attr) + '\n')
    # Single buffered write; `with` guarantees the handle is closed.
    with open(path, 'w') as f:
        f.writelines(rows)

# Interface for working with FCA (fca_lib.py follows).
# File: fca_lib.py -- concept/context containers for the FCA experiments.


class Concept:
    """A formal concept: extent (objects), intent (attributes) and indexes."""

    def __init__(self, obj, attr, main_index, aux_index1=None, aux_index2=None):
        self.objects = obj            # extent: list of object names
        self.attributes = attr        # intent: list of attribute names
        self.main_index = main_index  # aggregated ranking index
        self.aux_index1 = aux_index1  # scratch flag used by classification()
        self.aux_index2 = aux_index2
        self.stability = None         # filled in later by stability()

    def get_stability(self):
        """Accessor usable as a sort key (fl.Concept.get_stability)."""
        return self.stability


class Concepts:
    """Ranked list of concepts loaded from a concept-index dump.

    NOTE(review): the listing also showed an earlier
    __init__(self, concepts=[]) that this definition shadows (its mutable
    default argument was a latent bug anyway); only the reachable
    file-loading constructor is kept.
    """

    def __init__(self, path, p):
        concepts = []
        with open(path, encoding='utf-8') as concepts_file:
            # Records are blank-line separated; drop the header and trailer.
            blocks = concepts_file.read().split('\n\n')[1:-1]
        for block in blocks:
            subblocks = block.split('\n')
            attrs = subblocks[0].split('): ')[1].split(', ')
            objs = subblocks[1].split('): ')[1].split(', ')
            objs = [obj for obj in objs if len(obj) > 0]
            # NOTE(review): this expression was garbled in the extracted
            # listing ("...[4]...)float(...[3]...)+" with a stray "p*"
            # displaced further down the page); reconstructed as
            # field4 + p * field3 -- confirm against the original source.
            main_index = float(subblocks[4].split(':')[1]) + p * float(subblocks[3].split(':')[1])
            concepts.append(Concept(objs, attrs, main_index))
        # Descending by aggregated index.
        self.list = sorted(concepts, key=lambda x: -x.main_index)
        print('Concepts are loaded: ' + str(len(self.list)))


class Context:
    """Bidirectional object<->attribute maps read from an 'obj;attr' file."""

    def __init__(self, path):
        self.obj_attr = {}
        self.attr_obj = {}
        with open(path, encoding='utf-8') as context_file:
            for line in context_file:
                words = line.strip().split(';')
                obj = words[0]
                attr = words[1]
                self.obj_attr.setdefault(obj, []).append(attr)
                self.attr_obj.setdefault(attr, []).append(obj)
        self.objects_num = len(self.obj_attr)
        self.attributes_num = len(self.attr_obj)
        print('Context is loaded:\n\tObjects: ' + str(self.objects_num) +
              '\n\tAttributes: ' + str(self.attributes_num))

# Selection of the optimal coefficient (find_best_koef.py follows).
# File: find_best_koef.py -- grid search for the aggregation coefficient p.
import fca_lib as fl


def calc_map(rang):
    """Mean average precision of the ranked concept list `rang.list`.

    A concept counts as correct when all of its object names resolve to a
    single original id (the number after 'X').
    """
    rights = []
    k = 1
    for c in rang.list:
        if len(set([int(obj.split('X')[1]) for obj in c.objects])) == 1:
            rights.append(k)
            k += 1
        else:
            rights.append(0)
    # NOTE(review): raises ZeroDivisionError when no concept is correct.
    return sum([rights[i] / (i + 1) for i in range(len(rights))]) / len([1 for r in rights if r > 0])


context_file_name = input('context:')
#context_file_name = 'context10.txt'
context = fl.Context(context_file_name)
# Candidate coefficients: a fine grid on [0, 1) plus the squares 1..81.
points = [t * 0.05 for t in range(20)] + [t ** 2 for t in range(1, 10)]
res = []
for p in points:
    concepts = fl.Concepts('AggregatedIndex2.txt', p)
    res.append([-1 * calc_map(concepts), p])
for x in sorted(res):   # best (highest MAP) first
    print(round(x[1], 2), -1 * round(x[0], 6))


# Detecting concepts that form identical denotations.
# File: select_dublicates.py
import fca_lib as fl


def solve(concept):
    """Default accept predicate: aggregated index above 2."""
    return concept.main_index > 2


def control(dubls):
    """Sanity check: report every pair of groups sharing an object."""
    f = 0
    for i in range(len(dubls) - 1):
        for j in range(i + 1, len(dubls)):
            if len(dubls[i] & dubls[j]) > 0:
                if f == 0:
                    print('Error! Groups have common objects:')
                    f = 1
                print('\t groups ' + str(i + 1) + ' and ' + str(j + 1))
    if f == 0:
        print('Errors are not found')


def all_subsets(objs):
    """Yield every subset of `objs` as a list (2**len(objs) of them)."""
    size = len(objs)
    for mask in range(2 ** size):
        bits = bin(mask)[2:].zfill(size)
        yield [objs[t] for t in range(size) if bits[t] == '1']


def galua(attrs, context):
    """Galois derivation: objects possessing every attribute in `attrs`."""
    objs = set(context.obj_attr.keys())
    for attr in attrs:
        objs &= set(context.attr_obj[attr])
    return list(objs)


def stability(concepts, context):
    """Compute the stability index for each concept.

    Stability = fraction of intent subsets whose extent equals the
    concept's extent; exponential in the intent size, hence the guard.
    """
    for c in concepts.list:
        if len(c.attributes) > 32:
            # NOTE(review): `break` abandons ALL remaining concepts, not just
            # this one; `continue` may have been intended -- kept as listed.
            print('I can\'t do it!')
            break
        ch = 0
        zn = 2 ** len(c.attributes)
        for s in all_subsets(c.attributes):
            if len(galua(s, context)) == len(c.objects):
                ch += 1
        c.stability = ch * 1.0 / zn


context_file_name = input('context:')
#context_file_name = 'context.txt'
context = fl.Context(context_file_name)
concepts = fl.Concepts('AggregatedIndex2.txt', 4)
stab_concepts = fl.Concepts('AggregatedIndex2.txt', 4)
stability(stab_concepts, context)
# Second copy re-ranked by ascending stability.
stab_concepts.list.sort(key=fl.Concept.get_stability)
## Analyze
## -----------------------------------------------------------------
def _merge_group(dubls, members):
    """Fold the object set `members` into the disjoint group list `dubls`.

    Every existing group intersecting `members` is unioned with it, so the
    result stays a list of pairwise-disjoint sets. Returns the updated list.
    (Shared by classification / ham_classification / ham_classification2,
    which previously triplicated this logic.)
    """
    relevant = [k for k in range(len(dubls)) if len(dubls[k] & members) > 0]
    if len(relevant) == 0:
        dubls.append(set(members))
    elif len(relevant) == 1:
        dubls[relevant[0]] |= members
    else:
        merged = set(members)
        for k in relevant:
            merged |= dubls[k]
        dubls = [dubls[t] for t in range(len(dubls)) if t not in relevant]
        dubls.append(merged)
    return dubls


def classification(concepts, s=solve):
    """Group objects into duplicate clusters from ranked concepts.

    concepts -- Concepts whose .list is sorted by decreasing index.
    s        -- accept predicate on a concept.
    Returns a list of disjoint sets of object names.
    """
    dubls = []
    for c in concepts.list:
        c.aux_index1 = 1   # 1 = still eligible for a decision
    for i in range(len(concepts.list)):
        c = concepts.list[i]
        # Skip concepts fully absorbed by an existing group, or disabled
        # below by the rejection of one of their sub-concepts.
        covered = any(len(d & set(c.objects)) == len(c.objects) for d in dubls)
        if covered or c.aux_index1 == 0:
            continue
        if s(c):
            dubls = _merge_group(dubls, set(c.objects))
        else:
            # Rejecting c also disables every later concept whose extent
            # contains c's extent.
            for q in range(i + 1, len(concepts.list)):
                if (set(concepts.list[q].objects) & set(c.objects)) == set(c.objects):
                    concepts.list[q].aux_index1 = 0
    return dubls


def ham_classification(p):
    """Group objects whose shared-attribute count exceeds p (uses global `context`)."""
    objs = list(context.obj_attr.keys())
    pairs = []
    for i in range(len(objs) - 1):
        for j in range(i + 1, len(objs)):
            common = set(context.obj_attr[objs[i]]) & set(context.obj_attr[objs[j]])
            if len(common) > p:
                pairs.append([objs[i], objs[j]])
    dubls = []
    for pair in pairs:
        dubls = _merge_group(dubls, set(pair))
    return dubls


def ham_classification2(p):
    """Group objects whose Hamming distance (symmetric difference of
    attribute sets) is below p (uses global `context`)."""
    objs = list(context.obj_attr.keys())
    pairs = []
    for i in range(len(objs) - 1):
        for j in range(i + 1, len(objs)):
            a = set(context.obj_attr[objs[i]])
            b = set(context.obj_attr[objs[j]])
            # BUGFIX(reconstruction): the extracted listing lost the '-'
            # between the two lengths; |A∪B| - |A∩B| = |A Δ B|, i.e. the
            # Hamming distance the function name promises.
            if (len(a | b) - len(a & b)) < p:
                pairs.append([objs[i], objs[j]])
    dubls = []
    for pair in pairs:
        dubls = _merge_group(dubls, set(pair))
    return dubls
## 1.
## Range test: our index, stability
print('Test 1...')


def calc_map(rang):
    """Mean average precision of a ranked concept list (re-declared here
    for the standalone test section).

    A concept is 'right' when all of its objects resolve to one original
    id (the number after 'X' in each name).
    """
    rights = []
    k = 1
    for c in rang.list:
        if len(set([int(obj.split('X')[1]) for obj in c.objects])) == 1:
            rights.append(k)
            k += 1
        else:
            rights.append(0)
    # NOTE(review): raises ZeroDivisionError when no concept is 'right'.
    return sum([rights[i] / (i + 1) for i in range(len(rights))]) / len([1 for r in rights if r > 0])


all_concepts_cnt = len(concepts.list)
right_concepts = [c for c in concepts.list
                  if len(set([int(obj.split('X')[1]) for obj in c.objects])) == 1]
right_concepts_cnt = len(right_concepts)  # fixed typo: was right_concepts_ctn
dubls_cnt = 0
for obj in context.obj_attr.keys():
    if 'Y' in obj:   # planted duplicates are named 'Y<k>X<i>'
        dubls_cnt += 1
# Duplicate objects covered by at least one 'right' concept.
covered_objects = set()
for c in right_concepts:
    covered_objects |= set([obj for obj in c.objects if 'Y' in obj])
covered_objects_cnt = len(covered_objects)
fstat_range = 'stat_range.txt'
# NOTE(review): the last column divides by dubls_cnt and fails when the
# context contains no planted duplicates -- kept as listed.
with open(fstat_range, 'a') as fstat:
    fstat.write(str(all_concepts_cnt) + '\t' +
                str(right_concepts_cnt) + '\t' +
                str(round(calc_map(concepts), 6)) + '\t' +
                str(round(calc_map(stab_concepts), 6)) + '\t' +
                str(dubls_cnt) + '\t' +
                str(covered_objects_cnt) + '\t' +
                str(round(covered_objects_cnt / dubls_cnt, 6)) + '\n')
## Classification test: our index, stability, Hamming
print('Test 2...')
dubls_num = 0
objs_num = 0
for obj in context.obj_attr.keys():
    if 'Y' in obj:   # planted duplicates are named 'Y<k>X<i>'
        dubls_num += 1
    else:
        objs_num += 1


def classif_quality(dubls):
    """Return (recall-like, precision-like) scores for the group list.

    First value: duplicate links found / planted duplicates (global
    dubls_num). Second: duplicate links found / all links implied by the
    groups. Empty input scores (0, 1).
    """
    dubls_found = 0
    error_dubls = 0
    links_num = 0
    if len(dubls) == 0:
        return 0, 1
    for d in dubls:
        originals = [int(obj.split('X')[1]) for obj in d]
        dubls_found += len(originals) - len(set(originals))
        error_dubls += len(set(originals)) - 1
        links_num += len(originals) - 1
    return dubls_found / dubls_num, dubls_found / links_num


# Threshold sweeps for each ranking/classification method.
stab_points = [0.999999 - (t / 20) for t in range(20)]
our_points = [6 - (t / 5) for t in range(20)]
ham2_points = [t + 0.5 for t in range(11)]
ham_points = [11.5 - t for t in range(11)]
stab_cl_quality = []
our_cl_quality = []
ham_cl_quality = []
ham2_cl_quality = []
print(' Stability...')
for p in stab_points:
    dubls = classification(stab_concepts, lambda x: x.stability > p)
    stab_cl_quality.append(classif_quality(dubls))
print(' Our index...')
for p in our_points:
    dubls = classification(concepts, lambda x: x.main_index > p)
    our_cl_quality.append(classif_quality(dubls))
print(' Hamming...')
for p in ham_points:
    dubls = ham_classification(p)
    ham_cl_quality.append(classif_quality(dubls))
print(' Hamming2...')
for p in ham2_points:
    dubls = ham_classification2(p)
    ham2_cl_quality.append(classif_quality(dubls))

##fres = open('bublicates-' + context_file_name, 'w')
##print('finish!!!\n\nDublicates (' + str(len(dubls)) + '):')
##control(dubls)
##k = 0
##for d in dubls:
##    k += 1
##    #print(d)
##    st = 'Dublicates group ' + str(k) + ':'
##    for obj in d:
##        st += ' ' + str(obj)
##    fres.write(st + '\n')


def _two_rows(quality):
    """Format a [(recall, precision), ...] sweep as two tab-separated lines."""
    recalls = '\t'.join(str(round(q[0], 6)) for q in quality)
    precisions = '\t'.join(str(round(q[1], 6)) for q in quality)
    return recalls + '\n' + precisions + '\n'


# BUGFIX: in the listing this open() was commented out while fstat.write(st)
# below was not, so the write hit the handle already closed by Test 1.
# (The listing also recomputed dubls_num/objs_num here with identical
# values; the redundant recount is dropped.)
fstat = open('quality_stats.txt', 'a')
st = str(objs_num) + '\t' + str(dubls_num) + '\n'
st += _two_rows(our_cl_quality)
st += _two_rows(stab_cl_quality)
st += _two_rows(ham_cl_quality)
st += _two_rows(ham2_cl_quality)
fstat.write(st)
fstat.close()
##fstat.write(str(context.objects_num - dubls_num) + '\t' +
##            str(dubls_num) + '\t' +
##            str(dubls_found) + '\t' +
##            str(round(dubls_found / dubls_num, 6)) + '\t' +
##            str(round(dubls_found / links_num, 6)) + '\t' +
##            str(error_dubls) + '\t' +
##            str(error_dubls / (context.objects_num - dubls_num)) + '\n')
##fstat.close()