Archive de la catégorie «conversion»

14 avril 2008 – ajout du module de conversion des données

avril 14, 2008

10h48
suite de la construction du parser global.
pour mieux utiliser les données dans le parser, il est utile de les convertir au bon format.
elles sont toutes reconnues comme texte.
les seuls autres types pouvant exister pour des données sont integer ou float.
il pourrait y avoir des booléens mais on va partir du principe qu’il n’y en a pas.
les types listes ou autre propres a python ne sont pas présents dans des fichiers de données.
la conversion des données est intéressante mais ralentit le parsing.
il me semble que malgré tout il y a un gain de temps par la suite
La conversion utilise un bloc try/except, qui permet de s’affranchir de pas mal de tests déjà réalisés par les fonctions de conversion elles-mêmes. Ces fonctions sont int et float.
code du convertisseur

 def set_type(self, value):
     '''
     val = x.set_type(value)
     value -> str
     val -> str, int or float
     Convert a parsed text value to the most specific numeric type.

     The value is returned as an int when int() accepts it, otherwise as
     a float when float() accepts it, otherwise unchanged. A value that
     cannot be handled by float() at all (for example None) is also
     returned unchanged instead of raising TypeError.
     '''
     #first check that the value is numeric at all
     try:
         as_float = float(value)
     except (TypeError, ValueError):
         #not a number: keep the original value
         return value
     #the value is numeric: prefer an integer when it is a valid one
     try:
         return int(value)
     except (TypeError, ValueError):
         #e.g. "1.5" or "1e+5": valid float but not a valid integer
         return as_float

et le code du test correspondant

 def test_type_parsed_data(self):
     '''
     Check that the parsed values are converted to the expected types.

     The temporary file holds two header lines, one line of column names
     and two data lines mixing text, integer and float values.
     '''
     header_info_1 = "header_01"
     header_info_2 = "header_02"
     col_1 = "col_1"
     col_2 = "col_2"
     data_l1_c1 = "texte"
     data_l1_c2 = 1
     data_l2_c1 = 1.5
     data_l2_c2 = "1e+5"
     test_file = "#%s\n#%s\n%s\t%s\n%s\t%s\n%s\t%s" % (
         header_info_1, header_info_2,
         col_1, col_2,
         data_l1_c1, data_l1_c2,
         data_l2_c1, data_l2_c2)
     self.write_tmp_file(filetext=test_file)
     #remove the temporary file even when an assertion fails
     try:
         test_item = Items(file_arg='tmp_file')
         self.assertEqual(type(test_item[0].get_attribute(col_1)), type(data_l1_c1))
         self.assertEqual(type(test_item[0].get_attribute(col_2)), type(data_l1_c2))
         self.assertEqual(type(test_item[1].get_attribute(col_1)), type(data_l2_c1))
         #"1e+5" is written as text but must be parsed as a float
         self.assertEqual(type(test_item[1].get_attribute(col_2)), type(float(data_l2_c2)))
     finally:
         remove('tmp_file')

ce qui donne en sortie de la fonction test

..the argument is not a regular file objectfile toto doesn't existpermission denied for file tmp_parse.txt..----------------------------------------------------------------------Ran 4 tests in 0.065s

OK

11h34
le parser est bien avancé. il lui manque quelques fonctions comme l’impression

c’est facile à ajouter. l’impression sera par défaut basique, en utilisant les marqueurs et séparateurs par défaut.
tous les attributs seront imprimés par défaut.
il sera cependant possible de spécifier ces paramètres

# In class Items
def __str__(self, text_sep='', attr_list=None):
    '''
    print x  x.__str__()
    Return the whole collection as text.

    options :
    x.__str__([text_sep, [attr_list]])
    text_sep -> str, separator between values (default: self.text_sep)
    attr_list -> list of str, attributes to print (default: self.attr_list
    when it exists, otherwise no attribute line is printed)

    The text is made of the header lines (each prefixed by
    self.header_mark), the attribute names, then one line per item, all
    joined by a newline.
    '''
    #if no text separator was specified, use the default one
    if not text_sep:
        text_sep = self.text_sep
    #if no attribute list was specified, use the default one if it exists
    if not attr_list:
        attr_list = getattr(self, 'attr_list', None) or []
    #prefix each header line with the header mark; no header gives no line
    try:
        text_list = [self.header_mark + line for line in self.header]
    except AttributeError:
        text_list = []
    #add the attribute names, joined by the text separator
    if attr_list:
        text_list.append(text_sep.join(attr_list))
    #add the text of each item, printed with the same options
    text_list.extend(item.__str__(text_sep=text_sep, attr_list=attr_list)
                     for item in self)
    return '\n'.join(text_list)

# In class Item
def __str__(self, text_sep='\t', attr_list=None):
    '''
    print x  x.__str__()
    Return the values of x as text, using a tabulation as default
    text separator.

    options :
    x.__str__([text_sep, [attr_list]])
    text_sep -> str, separator between values
    attr_list -> list of str, attributes to print (default: all the
    keys of x)

    An attribute of attr_list that x does not have is printed as an
    empty string.
    '''
    #if no attribute list, use all the attributes of self
    if not attr_list:
        attr_list = self.keys()
    #convert each value to text, using '' for a missing attribute
    return text_sep.join([str(self.get(attr, '')) for attr in attr_list])

et j’ai donc ajouté un module de test pour cette fonction

 def test_printed_data(self):
     '''
     Check that parsed data are printed back correctly.

     Each case writes a text into the temporary file, parses it with
     Items and compares the printed text with the expected one.
     '''
     header_info_1 = "header_01"
     header_info_2 = "header_02"
     col_1 = "col_1"
     col_2 = "col_2"
     data_l1_c1 = "data_l1_c1"
     data_l1_c2 = "data_l1_c2"
     data_l2_c1 = "data_l2_c1"
     data_l2_c2 = "data_l2_c2"

     def check(filetext, expected, **options):
         #write the file, parse it and compare the printed text
         self.write_tmp_file(filetext=filetext)
         test_item = Items(file_arg='tmp_file')
         self.assertEqual(test_item.__str__(**options), expected)

     complete_file = "#%s\n#%s\n%s\t%s\n%s\t%s\n%s\t%s" % (
         header_info_1, header_info_2,
         col_1, col_2,
         data_l1_c1, data_l1_c2,
         data_l2_c1, data_l2_c2)

     #remove the temporary file even when an assertion fails
     try:
         #test with a complete file
         check(complete_file, complete_file)

         #test using a specific attribute list
         expected = "#%s\n#%s\n%s\n%s\n%s" % (
             header_info_1, header_info_2, col_1, data_l1_c1, data_l2_c1)
         check(complete_file, expected, attr_list=[col_1])

         #test using a specific text separator
         expected = "#%s\n#%s\n%s;%s\n%s;%s\n%s;%s" % (
             header_info_1, header_info_2,
             col_1, col_2,
             data_l1_c1, data_l1_c2,
             data_l2_c1, data_l2_c2)
         check(complete_file, expected, text_sep=";")

         #test with no header
         test_file = "%s\t%s\n%s\t%s\n%s\t%s" % (
             col_1, col_2, data_l1_c1, data_l1_c2, data_l2_c1, data_l2_c2)
         check(test_file, test_file)

         #test with no attribute list
         test_file = "%s\t%s\n%s\t%s" % (
             data_l1_c1, data_l1_c2, data_l2_c1, data_l2_c2)
         check(test_file, test_file)

         #test with no data
         test_file = "%s\t%s" % (col_1, col_2)
         check(test_file, test_file)

         #test with only header
         test_file = "#%s\n#%s" % (header_info_1, header_info_2)
         check(test_file, test_file)

         #test with nothing
         check('', '')
     finally:
         remove('tmp_file')

 def valid_object(self, nb_header_line, nb_col, nb_line):
     '''
     x.valid_object(nb_header_line, nb_col, nb_line)
     Create a tmp file with the given parameters, validate it with
     x.valid_item, then delete the file.
     '''
     #create the name of the file using the parameters (it could have been an arbitrary filename)
     filename = 'file_%s_%s_%s' % (nb_header_line, nb_col, nb_line)
     #call the method for creating the file
     self.create_tmp_file(filename, nb_header_line, nb_col, nb_line)
     #delete the tmp file even when the validation fails
     try:
         self.valid_item(filename=filename, nb_header_line=nb_header_line,
                         nb_col=nb_col, nb_line=nb_line)
     finally:
         self.delete_tmp_file(filename)

qui renvoie

...the argument is not a regular file objectfile toto doesn't existpermission denied for file tmp_parse.txt..----------------------------------------------------------------------Ran 5 tests in 0.065s

OK

je vais aussi ajouter une fonction assez pratique au parser.
il s’agit d’une sorte d’index des infos qui sont dans les attributs.
je ne suis pas sûr que ce soit pertinent car ça risque d’être lourd d’un point de vue traitement.
il faut probablement que ce soit fait a la demande.
comment procéder ?
il faut une fonction de requete.
cette fonction va créer l’index nécessaire s’il n’existe pas et rechercher dedans par la suite.
elle va donc passer par une fonction de création d’index. cette fonction va créer l’index souhaité pour le ou les attributs passés en argument.
en laissant la possibilité d’avoir plusieurs attributs en argument, on s’assure que la fonction pourra etre utilisée dans d’autre cas, comme la création de ces index de manière explicite.
si aucun argument est passé a cette fonction alors l’ensemble des index pour chaque attribut sera créé (risque d’etre long)

def create_indexes(self, attr_list=None):
    '''
    x.create_indexes([attr_list])
    attr_list -> list of string or str
    Create the indexes of every item of x for a list of attributes.

    Without argument, the indexes are created for all the attributes of
    x.attr_list. AttributeError is raised when no attribute is given
    and x has no attribute list.
    '''
    #test if there is an argument
    if not attr_list:
        #if not try to get the default attribute list
        try:
            attr_list = self.attr_list
        except AttributeError:
            #report the missing attribute list and let the error go up
            print("no attribute for these data, indexes cannot be created")
            raise
    #build the index for each item; an explicit loop, because map() is
    #lazy on Python 3 and would never call build_index
    for item in self:
        self.build_index(item, attr_list)

   def build_index(self, item, attr_list):
       '''
       x.build_index(item, attr_list)
       item -> instance of item
       attr_list -> list of str or str
       Register item into the index of each attribute of attr_list.

       x.indexes maps an attribute name to a dict that maps a value to
       the list of the items having this value; it is created on first
       use. An attribute that is not in x.attr_list is reported and
       skipped.
       '''
       #if attr_list is a single string, put it in a list
       if isinstance(attr_list, str):
           attr_list = [attr_list]
       for attr in attr_list:
           #test if it's a valid attribute
           if attr not in self.attr_list:
               print("%s is not a valide attribute" % attr)
               continue
           #get the indexes, creating them if they don't exist yet
           try:
               indexes = self.indexes
           except AttributeError:
               indexes = self.indexes = {}
           #get the index of the attribute, then the list of the items
           #sharing the value of this item, creating both when missing
           value_index = indexes.setdefault(attr, {})
           value_index.setdefault(item.get_attribute(attr), []).append(item)

   def get_items_by_value(self, attr, value):
       '''
       item_list = x.get_items_by_value(attr, value)
       attr -> str
       value -> str, int or float
       item_list -> instance of Items
       Return the items having a specified value for the attribute attr.

       The index of attr is created on demand when it does not exist.
       IndexError is raised when attr is not in x.attr_list, and
       AttributeError when x has no attribute list.
       '''
       try:
           known_attrs = self.attr_list
       except AttributeError:
           #report the missing attribute list and let the error go up
           print("no attribute list available")
           raise
       #check if the attr is in the attribute list
       if attr not in known_attrs:
           message = "%s is not a valide attribute" % attr
           print(message)
           raise IndexError(message)
       #try to access to the index of the given attribute
       try:
           attr_index_list = self.indexes[attr]
       except (AttributeError, KeyError):
           #either no index exists at all or none for this attribute:
           #create the index for the given attribute
           self.create_indexes(attr)
           #the index is still missing when x holds no item; in this
           #case search in an empty index
           attr_index_list = getattr(self, 'indexes', {}).get(attr, {})
       #return an instance of items with the list of the items matching with the given value
       return Items(item_list=attr_index_list.get(value, []))

j’ai du coup dû modifier un peu le __init__ pour qu’il accepte des listes de Item pour créer l’objet. cela permet, dans une fonction de recherche, de renvoyer un sous-ensemble de l’objet de la même forme que l’objet (une instance de Items) plutôt qu’une simple liste.
j’ai ajouté le code de test correspondant

def test_indexed_data(self):
    '''
    Check that the indexes created on demand are correct.

    The temporary file holds two header lines, three columns and two
    data lines; both lines share the same value in the third column.
    '''
    header_info_1 = "header_01"
    header_info_2 = "header_02"
    col_1 = "col_1"
    col_2 = "col_2"
    col_3 = "col_3"
    data_l1_c1 = "data_l1_c1"
    data_l1_c2 = "data_l1_c2"
    data_l2_c1 = "data_l2_c1"
    data_l2_c2 = "data_l2_c2"
    data_l2_c3 = "data_c3"
    data_l1_c3 = "data_c3"
    test_file = "#%s\n#%s\n%s\t%s\t%s\n%s\t%s\t%s\n%s\t%s\t%s" % (
        header_info_1, header_info_2,
        col_1, col_2, col_3,
        data_l1_c1, data_l1_c2, data_l1_c3,
        data_l2_c1, data_l2_c2, data_l2_c3)
    self.write_tmp_file(filetext=test_file)
    #remove the temporary file even when an assertion fails
    try:
        test_item = Items(file_arg='tmp_file')

        #a value found in a single item
        items_index_1 = test_item.get_items_by_value(col_1, data_l1_c1)
        self.assertTrue(isinstance(items_index_1, Items))
        self.assertEqual(len(items_index_1), 1)
        self.assertTrue(isinstance(items_index_1[0], Item))
        self.assertEqual(items_index_1[0].get_attribute(col_1), data_l1_c1)

        #a value shared by the two items
        items_index_2 = test_item.get_items_by_value(col_3, data_l1_c3)
        self.assertTrue(isinstance(items_index_2, Items))
        self.assertEqual(len(items_index_2), 2)
        self.assertTrue(isinstance(items_index_2[0], Item))
        self.assertTrue(isinstance(items_index_2[1], Item))
        self.assertEqual(items_index_2[0].get_attribute(col_3), data_l1_c3)
        self.assertEqual(items_index_2[1].get_attribute(col_3), data_l2_c3)

        #a value found in no item gives an empty instance of Items
        items_index_3 = test_item.get_items_by_value(col_1, 'toto')
        self.assertTrue(isinstance(items_index_3, Items))
        self.assertEqual(len(items_index_3), 0)
        self.assertFalse(items_index_3)

        #an unknown attribute is rejected
        self.assertRaises(IndexError, test_item.get_items_by_value,
                          attr='col_0', value='toto')
    finally:
        remove('tmp_file')

le test renvoie alors

col_0 is not a valide attribute....the argument is not a regular file objectfile toto doesn't existpermission denied for file tmp_parse.txt..----------------------------------------------------------------------Ran 6 tests in 0.170s

OK

Le parser a l’air à peu près complet, à voir à l’usage.
–> todo list tester le parser sur un plus gros fichier, type gal