10h48
suite de la construction du parser global.
pour mieux utiliser les données dans le parser, il est utile de les convertir au bon format.
elles sont toutes reconnues comme texte.
les seuls autres types pouvant exister pour des données sont integer ou float.
il pourrait y avoir des booléens mais on va partir du principe qu’il n’y en a pas.
les types listes ou autres types propres à Python ne sont pas présents dans des fichiers de données.
la conversion des données est intéressante mais ralentit le parsing.
il me semble que, malgré tout, il y a un gain de temps par la suite.
La conversion utilise l’instruction try, qui permet de s’affranchir de pas mal de tests qui sont réalisés par les fonctions de conversion elles-mêmes. Ces fonctions sont int et float.
code du convertisseur
def set_type(self, value):
    '''
    val = x.set_type(value)

    value -> str
    val   -> str, int or float

    Convert a parsed text value to the most specific type that accepts it:
    the int when int() accepts the text, otherwise the float when float()
    accepts it, otherwise the text itself, unchanged.
    '''
    # If float() rejects the text, give it back untouched.
    try:
        as_float = float(value)
    except ValueError:
        return value
    # float() accepted it; prefer the integer form when int() accepts it too
    # ("1" -> 1, while "1.5" and "1e+5" stay floats).
    try:
        return int(value)
    except ValueError:
        return as_float
et le code du test correspondant
def test_type_parsed_data(self):
    '''
    Check that parsed values come back with the expected Python type
    (str, int or float).
    '''
    header_info_1 = "header_01"
    header_info_2 = "header_02"
    col_1 = "col_1"
    col_2 = "col_2"
    data_l1_c1 = "texte"
    data_l1_c2 = 1
    data_l2_c1 = 1.5
    data_l2_c2 = "1e+5"
    # NOTE(review): the line breaks of this fixture were lost in the pasted
    # listing; they are restored as one record per line -- TODO confirm.
    test_file = '#%s\n#%s\n%s\t%s\n%s\t%s\n%s\t%s' % (
        header_info_1, header_info_2,
        col_1, col_2,
        data_l1_c1, data_l1_c2,
        data_l2_c1, data_l2_c2)
    self.write_tmp_file(filetext=test_file)
    # Remove the temporary file even when an assertion fails.
    try:
        test_item = Items(file_arg='tmp_file')
        self.assertEqual(type(test_item[0].get_attribute(col_1)),
                         type(data_l1_c1))
        self.assertEqual(type(test_item[0].get_attribute(col_2)),
                         type(data_l1_c2))
        self.assertEqual(type(test_item[1].get_attribute(col_1)),
                         type(data_l2_c1))
        # "1e+5" is written as text but must be read back as a float.
        self.assertEqual(type(test_item[1].get_attribute(col_2)),
                         type(float(data_l2_c2)))
    finally:
        remove('tmp_file')
ce qui donne en sortie de la fonction test
..the argument is not a regular file object
file toto doesn't exist
permission denied for file tmp_parse.txt
..
----------------------------------------------------------------------
Ran 4 tests in 0.065s
OK
11h34
le parser est bien avancé. il lui manque quelques fonctions comme l’impression.
c’est facile à ajouter. l’impression sera par défaut basique, en utilisant les marqueurs et séparateurs par défaut.
tous les attributs seront imprimés par défaut.
il sera cependant possible de spécifier ces paramètres.
# In class Items
def __str__(self, text_sep='', attr_list=None):
    '''
    print x
    x.__str__([text_sep, [attr_list]])

    text_sep  -> str, separator between values (default: self.text_sep)
    attr_list -> list of str, attributes to print (default: self.attr_list
                 when it exists)

    Return the data in a text format: the header lines prefixed with
    self.header_mark (when a header exists), then the attribute names
    (when there are any), then one line per item, all joined by a newline.
    '''
    # if no text separator was specified, use the default one
    if not text_sep:
        text_sep = self.text_sep
    # if no attribute list was specified, use the default one if it exists,
    # or nothing
    if not attr_list:
        attr_list = getattr(self, 'attr_list', None) or []
    # if a header exists, prefix each of its lines with the header mark;
    # a real list is built so the lines below can be appended to it
    try:
        text_list = [self.header_mark + line for line in self.header]
    except AttributeError:
        text_list = []
    # if there is one, add the attribute list, joined by the text separator
    if attr_list:
        text_list.append(text_sep.join(attr_list))
    # let every item print itself with the same separator and attributes
    text_list.extend(
        item.__str__(text_sep=text_sep, attr_list=attr_list) for item in self)
    return '\n'.join(text_list)
# In class Item
def __str__(self, text_sep='\t', attr_list=None):
    '''
    print x
    x.__str__([text_sep, [attr_list]])

    text_sep  -> str, separator between values (default: tabulation)
    attr_list -> list of str, attributes to print (default: self.keys())

    Return the values of the requested attributes as text, joined by
    text_sep. An attribute the item does not have is printed as an empty
    string.
    '''
    # if no attribute list was given, use all the attributes of self
    if not attr_list:
        attr_list = self.keys()
    # missing attributes fall back to '' so the columns stay aligned
    return text_sep.join([str(self.get(attr, '')) for attr in attr_list])
et j’ai donc ajouté un module de test pour cette fonction
def test_printed_data(self):
    '''
    Check that parsed data are printed back in the expected text format.
    '''
    header_info_1 = "header_01"
    header_info_2 = "header_02"
    col_1 = "col_1"
    col_2 = "col_2"
    data_l1_c1 = "data_l1_c1"
    data_l1_c2 = "data_l1_c2"
    data_l2_c1 = "data_l2_c1"
    data_l2_c2 = "data_l2_c2"

    # NOTE(review): the line breaks of the fixtures were lost in the pasted
    # listing; they are restored as one record per line -- TODO confirm.
    complete_file = '#%s\n#%s\n%s\t%s\n%s\t%s\n%s\t%s' % (
        header_info_1, header_info_2,
        col_1, col_2,
        data_l1_c1, data_l1_c2,
        data_l2_c1, data_l2_c2)

    self.write_tmp_file(filetext=complete_file)
    # Remove the temporary file even when an assertion fails.
    try:
        # test with a complete file
        test_item = Items(file_arg='tmp_file')
        self.assertEqual(str(test_item), complete_file)

        # test using a specific attribute list
        self.write_tmp_file(filetext=complete_file)
        test_item = Items(file_arg='tmp_file')
        expected = '#%s\n#%s\n%s\n%s\n%s' % (
            header_info_1, header_info_2, col_1, data_l1_c1, data_l2_c1)
        self.assertEqual(test_item.__str__(attr_list=[col_1]), expected)

        # test using a specific text separator
        self.write_tmp_file(filetext=complete_file)
        test_item = Items(file_arg='tmp_file')
        expected = '#%s\n#%s\n%s;%s\n%s;%s\n%s;%s' % (
            header_info_1, header_info_2,
            col_1, col_2,
            data_l1_c1, data_l1_c2,
            data_l2_c1, data_l2_c2)
        self.assertEqual(test_item.__str__(text_sep=";"), expected)

        # the remaining files must be printed back exactly as written
        round_trip_files = (
            # no header
            '%s\t%s\n%s\t%s\n%s\t%s' % (col_1, col_2,
                                        data_l1_c1, data_l1_c2,
                                        data_l2_c1, data_l2_c2),
            # no attribute list
            '%s\t%s\n%s\t%s' % (data_l1_c1, data_l1_c2,
                                data_l2_c1, data_l2_c2),
            # no data
            '%s\t%s' % (col_1, col_2),
            # only header
            '#%s\n#%s' % (header_info_1, header_info_2),
            # nothing
            '',
        )
        for file_text in round_trip_files:
            self.write_tmp_file(filetext=file_text)
            test_item = Items(file_arg='tmp_file')
            self.assertEqual(str(test_item), file_text)
    finally:
        remove('tmp_file')

def valid_object(self, nb_header_line, nb_col, nb_line):
    '''
    x.valid_object(nb_header_line, nb_col, nb_line)

    Create a temporary file from the given parameters, run self.valid_item
    on it, then delete the file.
    '''
    # build the file name from the parameters (an arbitrary name would do)
    filename = 'file_%s_%s_%s' % (nb_header_line, nb_col, nb_line)
    # call the method that creates the file
    self.create_tmp_file(filename, nb_header_line, nb_col, nb_line)
    # delete the temporary file even when the validation fails
    try:
        self.valid_item(filename=filename,
                        nb_header_line=nb_header_line,
                        nb_col=nb_col,
                        nb_line=nb_line)
    finally:
        self.delete_tmp_file(filename)
qui renvoie
...the argument is not a regular file object
file toto doesn't exist
permission denied for file tmp_parse.txt
..
----------------------------------------------------------------------
Ran 5 tests in 0.065s
OK
je vais aussi ajouter une fonction assez pratique au parser.
il s’agit d’une sorte d’index des infos qui sont dans les attributs.
je ne suis pas sûr que ce soit pertinent car ça risque d’être lourd d’un point de vue traitement.
il faut probablement que ce soit fait à la demande.
comment procéder ?
il faut une fonction de requête.
cette fonction va créer l’index nécessaire s’il n’existe pas et rechercher dedans par la suite.
elle va donc passer par une fonction de création d’index. cette fonction va créer l’index souhaité pour le ou les attributs passés en argument.
en laissant la possibilité d’avoir plusieurs attributs en argument, on s’assure que la fonction pourra être utilisée dans d’autres cas, comme la création de ces index de manière explicite.
si aucun argument n’est passé à cette fonction, alors l’ensemble des index pour chaque attribut sera créé (risque d’être long).
def create_indexes(self, attr_list=None):
    '''
    x.create_indexes([attr_list])

    attr_list -> list of str or str

    Create the indexes for a list of attributes (default: self.attr_list).
    Calling it again for the same attribute rebuilds that index from
    scratch instead of adding every item a second time.

    Raise AttributeError when no attribute list is given and the object
    has none.
    '''
    # test if there is an argument
    if not attr_list:
        # if not, try to get the default attribute list
        try:
            attr_list = self.attr_list
        except AttributeError:
            # the object has no attribute list: report it and let the
            # error propagate to stop the function
            print("no attribute for these data, indexes cannot be created")
            raise
    # a single attribute name is accepted as well as a list of names
    if isinstance(attr_list, str):
        attr_list = [attr_list]
    # start every requested (and known) attribute from an empty index, so a
    # rebuild never duplicates entries and an object without items still
    # ends up with an (empty) index
    known_attrs = getattr(self, 'attr_list', ())
    for attr in attr_list:
        if attr in known_attrs:
            try:
                self.indexes[attr] = {}
            except AttributeError:
                # first index ever created on this object
                self.indexes = {attr: {}}
    # build the index for each item
    for item in self:
        self.build_index(item, attr_list)
def build_index(self, item, attr_list):
    '''
    x.build_index(item, attr_list)

    item      -> instance of Item
    attr_list -> list of str or str

    Add item to the index of each attribute of attr_list. self.indexes maps
    an attribute name to a dict which maps a value to the list of items
    having that value. Attributes that are not in self.attr_list are
    reported and skipped.
    '''
    # if attr_list is a single string, put it in a list
    if isinstance(attr_list, str):
        attr_list = [attr_list]
    for attr in attr_list:
        # test if it's a valid attribute
        if attr not in self.attr_list:
            print("%s is not a valide attribute" % attr)
            continue
        # get the index of this attribute, creating it empty if needed
        try:
            attr_index = self.indexes.setdefault(attr, {})
        except AttributeError:
            # the object has no indexes attribute yet: create it
            self.indexes = {}
            attr_index = self.indexes.setdefault(attr, {})
        # get the list of items sharing the value of this item (an empty
        # list if the value was never seen) and add the item to it
        attr_index.setdefault(item.get_attribute(attr), []).append(item)
def get_items_by_value(self, attr, value):
    '''
    item_list = x.get_items_by_value(attr, value)

    attr      -> str
    value     -> str, int or float
    item_list -> instance of Items

    Return the items having the given value for the attribute attr. The
    index of attr is created on first use. When nothing matches (including
    when the object holds no item at all) an empty Items is returned.

    Raise AttributeError when the object has no attribute list and
    IndexError when attr is not in that list.
    '''
    # check if attr is in the attribute list
    try:
        is_known = attr in self.attr_list
    except AttributeError:
        # there is no attribute list at all
        print("no attribute list available")
        raise
    if not is_known:
        print("%s is not a valide attribute" % attr)
        raise IndexError("%s is not a valide attribute" % attr)

    # create the index of the attribute when self.indexes does not exist
    # yet or does not hold this attribute
    indexes = getattr(self, 'indexes', None)
    if indexes is None or attr not in indexes:
        self.create_indexes(attr)
        # without any item there may still be nothing to look up
        indexes = getattr(self, 'indexes', {})
    attr_index = indexes.get(attr, {})
    # return an instance of Items with the items matching the given value
    return Items(item_list=attr_index.get(value, []))
j’ai du coup dû modifier un peu le __init__ pour qu’il accepte des listes d’items pour créer l’objet. cela permet, dans une fonction de recherche, de renvoyer un sous-ensemble de l’objet de la même forme que l’objet (une instance de Items) plutôt qu’une simple liste.
j’ai ajouté le code de test correspondant
def test_indexed_data(self):
    '''
    Check that the indexes created on demand return the right items.
    '''
    header_info_1 = "header_01"
    header_info_2 = "header_02"
    col_1 = "col_1"
    col_2 = "col_2"
    col_3 = "col_3"
    data_l1_c1 = "data_l1_c1"
    data_l1_c2 = "data_l1_c2"
    data_l2_c1 = "data_l2_c1"
    data_l2_c2 = "data_l2_c2"
    # both lines share the same value in the third column
    data_l1_c3 = "data_c3"
    data_l2_c3 = "data_c3"
    # NOTE(review): the line breaks of this fixture were lost in the pasted
    # listing; they are restored as one record per line -- TODO confirm.
    test_file = '#%s\n#%s\n%s\t%s\t%s\n%s\t%s\t%s\n%s\t%s\t%s' % (
        header_info_1, header_info_2,
        col_1, col_2, col_3,
        data_l1_c1, data_l1_c2, data_l1_c3,
        data_l2_c1, data_l2_c2, data_l2_c3)
    self.write_tmp_file(filetext=test_file)
    # Remove the temporary file even when an assertion fails.
    try:
        test_item = Items(file_arg='tmp_file')

        # a value present on a single line
        items_index_1 = test_item.get_items_by_value(col_1, data_l1_c1)
        self.assertTrue(isinstance(items_index_1, Items))
        self.assertEqual(len(items_index_1), 1)
        self.assertTrue(isinstance(items_index_1[0], Item))
        self.assertEqual(items_index_1[0].get_attribute(col_1), data_l1_c1)

        # a value shared by the two lines
        items_index_2 = test_item.get_items_by_value(col_3, data_l1_c3)
        self.assertTrue(isinstance(items_index_2, Items))
        self.assertEqual(len(items_index_2), 2)
        self.assertTrue(isinstance(items_index_2[0], Item))
        self.assertTrue(isinstance(items_index_2[1], Item))
        self.assertEqual(items_index_2[0].get_attribute(col_3), data_l1_c3)
        self.assertEqual(items_index_2[1].get_attribute(col_3), data_l2_c3)

        # a value that is not in the file
        items_index_3 = test_item.get_items_by_value(col_1, 'toto')
        self.assertTrue(isinstance(items_index_3, Items))
        self.assertEqual(len(items_index_3), 0)
        self.assertFalse(items_index_3)

        # an attribute that is not in the file
        self.assertRaises(IndexError, test_item.get_items_by_value,
                          attr='col_0', value='toto')
    finally:
        remove('tmp_file')
le test renvoie alors
col_0 is not a valide attribute
....the argument is not a regular file object
file toto doesn't exist
permission denied for file tmp_parse.txt
..
----------------------------------------------------------------------
Ran 6 tests in 0.170s
OK
Le parser a l’air à peu près complet, à voir à l’usage.
–> todo list : tester le parser sur un plus gros fichier, type gal