{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"FineTuningTechniques.ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"widgets":{"application/vnd.jupyter.widget-state+json":{"eff23e86f29b458090212600b5e63563":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_f5eea29c8af148e6b9ecf585885aa0b2","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_2134bca9c6f742089a253f7fb58525e8","IPY_MODEL_f9de162dc6e14139b177c93fac3a0b01"]}},"f5eea29c8af148e6b9ecf585885aa0b2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"2134bca9c6f742089a253f7fb58525e8":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_74ad646f5e2a48eb9d3d0e363b71d7ed","_dom_classes":[],"de
scription":"Optimization Progress: 100%","_model_name":"FloatProgressModel","bar_style":"","max":84,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":84,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_7730fde597a24584acc0d1668ae9ba71"}},"f9de162dc6e14139b177c93fac3a0b01":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_7f435f16c67545b384a74c29c02de8ea","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 84/84 [12:40<00:00, 10.14s/pipeline]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_2ce11d9acb4d42fd82c6215b0fc429da"}},"74ad646f5e2a48eb9d3d0e363b71d7ed":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"7730fde597a24584acc0d1668ae9ba71":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"gr
id_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"7f435f16c67545b384a74c29c02de8ea":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"2ce11d9acb4d42fd82c6215b0fc429da":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}}}}},"cells":[{"cell_type":"markdown","metadata":{"id":"tLmdTK71BuHf"},"source":["## Μετασχηματιστές και Εκτιμητές στο scikit-learn\n","\n","Οι μετασχηματιστές έχουν δύο βασικές μεθόδους, την fit και την transform. 
Με την fit μαθαίνουν κάποιες παραμέτρους με βάση τα δεδομένα train και με την transform μπορούν να μετασχηματίσουν τα δεδομένα (train ή test) βάσει των παραμέτρων που έχουν μάθει.\n","![alt text](https://i.imgur.com/uqtJyI8.jpg)\n","\n","Η δεύτερη μεγάλη κλάση του scikit learn είναι οι εκτιμητές (**estimators**). Αυτό που τους διαφοροποιεί από τους μετασχηματιστές είναι ότι ενώ έχουν τις δικές τους μεθόδους fit και σε κάποιες περιπτώσεις και transform, έχουν επιπλέον τη μέθοδο predict, που κάνει τις προβλέψεις πάνω στα δεδομένα του test set. Οι ταξινομητές στην επιβλεπόμενη μάθηση είναι λοιπόν όλοι τους εκτιμητές, εφόσον κάνουν fit πάνω στα δεδομένα train και predict στα δεδομένα test.\n","\n","![alt text](https://i.imgur.com/jhExL9i.jpg)\n","\n","Θα δούμε στη συνέχεια ότι μπορούμε να σχηματίζουμε αλυσίδες πολλών διαδοχικών μετασχηματιστών που καταλήγουν σε έναν εκτιμητή, τα λεγόμενα pipelines."]},{"cell_type":"markdown","metadata":{"id":"-Z0Nf6HMCPAw"},"source":["## Pipelines\n","\n","Μια διαδικασία ML ή workflow ή pipeline αποτελείται από μια σειρά μετασχηματιστών πάνω στα χαρακτηριστικά των δεδομένων και καταλήγει σε έναν εκτιμητή.\n","\n","Οι μετασχηματιστές χρησιμοποιούνται για να κάνουν την προεπεξεργασία (μέσω μετασχηματισμού) των δεδομένων. \n","\n","Στο προηγούμενο εργαστήριο είδαμε κάποιες μεθόδους μείωσης διαστάσεων και επιλογής χαρακτηριστικών. 
Και οι δύο αυτές μέθοδοι τείνουν να μειώσουν τον αριθμό των χαρακτηριστικών στο σύνολο δεδομένων. Οι μέθοδοι μείωσης διαστάσεων το κάνουν δημιουργώντας νέους συνδυασμούς χαρακτηριστικών (μερικές φορές γνωστές ως μετασχηματισμός χαρακτηριστικών), ενώ οι μέθοδοι επιλογής χαρακτηριστικών περιλαμβάνουν και εξαιρούν χαρακτηριστικά που υπάρχουν στα δεδομένα χωρίς να τα αλλάζουν.\n","\n","Οι μετασχηματιστές γενικά έχουν και αυτοί υπερ-παραμέτρους που επηρεάζουν τη λειτουργία τους, π.χ. ο VarianceThreshold είχε το κατώτερο κατώφλι διακύμανσης και ο PCA τον αριθμό των κύριων συνιστωσών.\n","\n","Η επιλογή των υπερ-παραμέτρων γίνεται μόνο εμπειρικά και μέσω της διασταυρούμενης επικύρωσης (cross-validation) μπορούμε να λάβουμε καλύτερα αποτελέσματα.\n"]},{"cell_type":"markdown","metadata":{"id":"4rHG4TEj9rZP"},"source":["\n",""]},{"cell_type":"markdown","metadata":{"id":"jItBKAnXzYHT"},"source":["Έτσι, για να βελτιώσουμε την απόδοση του μοντέλου μας, χρησιμοποιούμε Διασταυρούμενη Επικύρωση (Cross-Validation) στο σετ εκπαίδευσης.\n","\n","Μία από τις πιο κοινές μεθόδους Cross-Validation είναι η K-Fold Validation. \n","\n","Στο K-Fold, διαιρούμε το σύνολο δεδομένων μας σε Ν τμήματα και στη συνέχεια εκπαιδεύουμε επαναληπτικά το μοντέλο μας χρησιμοποιώντας Ν-1 τμήματα και το ελέγχουμε χρησιμοποιώντας το τμήμα που απομένει (σε κάθε επανάληψη αλλάζουμε το τμήμα που αφήνουμε εκτός εκπαίδευσης). Αφού εκπαιδεύσουμε N φορές το μοντέλο μας, υπολογίζουμε στη συνέχεια τον μέσο όρο των αποτελεσμάτων που λαμβάνονται σε κάθε επανάληψη για να λάβουμε τα συνολικά αποτελέσματα της εκπαίδευσής μας.\n","\n","\n"]},{"cell_type":"markdown","metadata":{"id":"xGwkzhG9LqDx"},"source":["Η χρήση διασταυρούμενης επικύρωσης κατά την εφαρμογή της βελτιστοποίησης υπερπαραμέτρων μπορεί να είναι πολύ σημαντική. 
Με αυτόν τον τρόπο, ενδέχεται να αποφύγουμε τη χρήση ορισμένων υπερπαραμέτρων που λειτουργούν πολύ καλά στα δεδομένα εκπαίδευσης αλλά όχι τόσο καλά με τα δεδομένα δοκιμής."]},{"cell_type":"markdown","metadata":{"id":"1W11fNsVKSF_"},"source":["\n","# Παράμετροι μοντέλων μηχανικής μάθησης\n","Τα μοντέλα μηχανικής μάθησης απαρτίζονται από δύο διαφορετικούς τύπους παραμέτρων:\n","\n","* **Υπερπαράμετροι (Hyperparameters)**\n","\n","Είναι όλες οι παράμετροι που μπορούν να ρυθμιστούν αυθαίρετα από το χρήστη πριν ξεκινήσει η εκπαίδευση του μοντέλου (π.χ. αριθμός εκτιμητών στο Random Forest) και καθορίζουν τη δομή του μοντέλου μας.\n","\n","* **Παράμετροι μοντέλου (Model parameters)**\n","\n","Μαθαίνονται κατά τη διάρκεια της εκπαίδευσης του μοντέλου (π.χ. βάρη στα νευρωνικά δίκτυα, κτλ) και καθορίζουν τον τρόπο χρήσης των δεδομένων εισόδου ώστε να ληφθεί η επιθυμητή έξοδος. \n","\n","Η ταυτόχρονη ρύθμιση (tuning) των υπερπαραμέτρων καθώς και των παραμέτρων των μοντέλων είναι ένας τύπος προβλήματος βελτιστοποίησης. \n","\n","Έχουμε ένα σύνολο υπερπαραμέτρων και στοχεύουμε να βρούμε το σωστό συνδυασμό των τιμών τους που θα μας βοηθήσει να βρούμε είτε την ελάχιστη τιμή (π.χ. απώλεια) είτε τη μέγιστη (π.χ. 
ακρίβεια) μιας συνάρτησης.\n","\n","Αυτό μπορεί να είναι ιδιαίτερα σημαντικό κατά τη σύγκριση της απόδοσης διαφορετικών μοντέλων μηχανικής μάθησης σε ένα σύνολο δεδομένων.\n","\n","## Προσεγγίσεις για τη βελτιστοποίηση υπερπαραμέτρων (Hyperparameter optimization)\n","\n","* Χειροκίνητη αναζήτηση (Manual Search)\n","* Τυχαία αναζήτηση (Random Search)\n","* Αναζήτηση πλέγματος (Grid Search)\n","* Αυτόματη ρύθμιση υπερπαραμέτρων (Bayesian Optimization, Genetic Algorithms)\n","* Ρύθμιση Artificial Neural Networks (ANNs) \n","\n"]},{"cell_type":"markdown","metadata":{"id":"HSCQvGFsLyDj"},"source":["Το σύνολο δεδομένων που θα χρησιμοποιήσουμε είναι το Kaggle Credit Card Fraud Detection.\n","\n","Στόχος μας είναι να ταξινομήσουμε σωστά τις συναλλαγές με πιστωτική κάρτα σε γνήσιες και μη γνήσιες (δυαδική ταξινόμηση). Θα χρησιμοποιήσουμε έναν περιορισμένο αριθμό χαρακτηριστικών από αυτό το σύνολο."]},{"cell_type":"code","metadata":{"id":"yxvJ_hLMh5jS"},"source":["import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n","from matplotlib.pyplot import figure\n","import numpy as np\n","import seaborn as sns\n","\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import classification_report,confusion_matrix\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import accuracy_score\n","from sklearn.model_selection import cross_val_score"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"2n67pbgkqBTn","colab":{"base_uri":"https://localhost:8080/","height":217},"executionInfo":{"status":"ok","timestamp":1604997747746,"user_tz":-120,"elapsed":11273,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"93229c01-c4a3-4cbb-8891-4c88893e28f8"},"source":["raw_df = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')\n","raw_df.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
\n","
"],"text/plain":[" Time V1 V2 V3 ... V27 V28 Amount Class\n","0 0.0 -1.359807 -0.072781 2.536347 ... 0.133558 -0.021053 149.62 0\n","1 0.0 1.191857 0.266151 0.166480 ... -0.008983 0.014724 2.69 0\n","2 1.0 -1.358354 -1.340163 1.773209 ... -0.055353 -0.059752 378.66 0\n","3 1.0 -0.966272 -0.185226 1.792993 ... 0.062723 0.061458 123.50 0\n","4 2.0 -1.158233 0.877737 1.548718 ... 0.219422 0.215153 69.99 0\n","\n","[5 rows x 31 columns]"]},"metadata":{"tags":[]},"execution_count":2}]},{"cell_type":"code","metadata":{"id":"JfGtSCwxmRDN","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997747749,"user_tz":-120,"elapsed":11269,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"0c8b79e2-4c93-48fe-c2f1-fe61256bf0f2"},"source":["print(raw_df.shape)\n","print(raw_df.columns)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["(284807, 31)\n","Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',\n"," 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',\n"," 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',\n"," 'Class'],\n"," dtype='object')\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"OhAFL1I7sl-1","colab":{"base_uri":"https://localhost:8080/","height":977},"executionInfo":{"status":"ok","timestamp":1604997747750,"user_tz":-120,"elapsed":11262,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"2e336b69-7715-4a54-e2ad-f8bd235b99aa"},"source":["percent_missing = raw_df.isnull().sum() * 100 / len(raw_df)\n","missing_values = pd.DataFrame({'percent_missing': percent_missing}) \n","missing_values.sort_values(by ='percent_missing' , ascending=False)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
percent_missing
Time0.0
V160.0
Amount0.0
V280.0
V270.0
V260.0
V250.0
V240.0
V230.0
V220.0
V210.0
V200.0
V190.0
V180.0
V170.0
V150.0
V10.0
V140.0
V130.0
V120.0
V110.0
V100.0
V90.0
V80.0
V70.0
V60.0
V50.0
V40.0
V30.0
V20.0
Class0.0
\n","
"],"text/plain":[" percent_missing\n","Time 0.0\n","V16 0.0\n","Amount 0.0\n","V28 0.0\n","V27 0.0\n","V26 0.0\n","V25 0.0\n","V24 0.0\n","V23 0.0\n","V22 0.0\n","V21 0.0\n","V20 0.0\n","V19 0.0\n","V18 0.0\n","V17 0.0\n","V15 0.0\n","V1 0.0\n","V14 0.0\n","V13 0.0\n","V12 0.0\n","V11 0.0\n","V10 0.0\n","V9 0.0\n","V8 0.0\n","V7 0.0\n","V6 0.0\n","V5 0.0\n","V4 0.0\n","V3 0.0\n","V2 0.0\n","Class 0.0"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"id":"FWbSI9pPs6r6","colab":{"base_uri":"https://localhost:8080/","height":588},"executionInfo":{"status":"ok","timestamp":1604997749392,"user_tz":-120,"elapsed":12894,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"d05e0967-0454-47d3-d0fe-c8b083dbe779"},"source":["figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')\n","\n","corr=raw_df.corr()\n","sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{"tags":[]},"execution_count":5},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["
"]},"metadata":{"tags":[]}}]},{"cell_type":"code","metadata":{"id":"WTT_e0tutKi5","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997749394,"user_tz":-120,"elapsed":12889,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"d9e87007-99d9-4f6c-d950-0955c830b069"},"source":["#Κρατάμε ένα τμήμα του αρχικού dataset πάνω στο οποίο θα δουλέψουμε \n","df2 = raw_df[raw_df.Class == 1][0:450]\n","print(df2.shape)\n","df3 = raw_df[raw_df.Class == 0][0:450]\n","print(df3.shape)\n","\n","df = df2.append(df3, ignore_index=True)\n","#df4.head()\n","df.shape"],"execution_count":null,"outputs":[{"output_type":"stream","text":["(450, 31)\n","(450, 31)\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["(900, 31)"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"code","metadata":{"id":"Fm8iF2w9tKpR","colab":{"base_uri":"https://localhost:8080/","height":217},"executionInfo":{"status":"ok","timestamp":1604997749396,"user_tz":-120,"elapsed":12884,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"222aab25-4e71-46b5-cab1-55db0341301b"},"source":["df.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
0406.0-2.3122271.951992-1.6098513.997906-0.522188-1.426545-2.5373871.391657-2.770089-2.7722723.202033-2.899907-0.595222-4.2892540.389724-1.140747-2.830056-0.0168220.4169560.1269110.517232-0.035049-0.4652110.3201980.0445190.1778400.261145-0.1432760.001
1472.0-3.043541-3.1573071.0884632.2886441.359805-1.0648230.325574-0.067794-0.270953-0.838587-0.414575-0.5031410.676502-1.6920292.0006350.6667800.5997171.7253210.2833452.1023390.6616960.4354771.375966-0.2938030.279798-0.145362-0.2527730.035764529.001
24462.0-2.3033501.759247-0.3597452.330243-0.821628-0.0757880.562320-0.399147-0.238253-1.5254122.032912-6.5601240.022937-1.470102-0.698826-2.282194-4.781831-2.615665-1.334441-0.430022-0.294166-0.9323910.172726-0.087330-0.156114-0.5426280.039566-0.153029239.931
36986.0-4.3979741.358367-2.5928442.679787-1.128131-1.706536-3.496197-0.248778-0.247768-4.8016374.895844-10.9128190.184372-6.771097-0.007326-7.358083-12.598419-5.1315490.308334-0.1716080.5735740.176968-0.436207-0.0535020.252405-0.657488-0.8271360.84957359.001
47519.01.2342353.019740-4.3045974.7327953.624201-1.3577461.713445-0.496358-1.282858-2.4474692.101344-4.6096281.464378-6.079337-0.3392372.5818516.7393843.042493-2.7218530.009061-0.379068-0.704181-0.656805-1.6326531.4889010.566797-0.0100160.1467931.001
\n","
"],"text/plain":[" Time V1 V2 V3 ... V27 V28 Amount Class\n","0 406.0 -2.312227 1.951992 -1.609851 ... 0.261145 -0.143276 0.00 1\n","1 472.0 -3.043541 -3.157307 1.088463 ... -0.252773 0.035764 529.00 1\n","2 4462.0 -2.303350 1.759247 -0.359745 ... 0.039566 -0.153029 239.93 1\n","3 6986.0 -4.397974 1.358367 -2.592844 ... -0.827136 0.849573 59.00 1\n","4 7519.0 1.234235 3.019740 -4.304597 ... -0.010016 0.146793 1.00 1\n","\n","[5 rows x 31 columns]"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"6HmKOBJ1mPaE","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997749397,"user_tz":-120,"elapsed":12877,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"6cbc7d2d-7bd7-4b29-abee-856f6dfd618a"},"source":["X= df.drop(['Class'], axis = 1).values\n","Y = df['Class']\n","\n","X = StandardScaler().fit_transform(X)\n","X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.30, random_state = 101)\n","\n","print(X_Train,'\\n')\n","\n","print(X.shape,'\\n', X_Train.shape,'\\n', X_Test.shape)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[-7.70121128e-01 6.63715437e-01 -7.30180207e-01 ... 2.71362876e-03\n"," 2.65934078e-02 -1.13245680e-01]\n"," [-7.66150728e-01 3.29126869e-01 -3.74069907e-01 ... -2.66516655e-02\n"," -2.31038210e-01 -2.73784228e-01]\n"," [-7.71570640e-01 3.79436522e-01 -3.14038801e-01 ... -1.53580703e-01\n"," -4.97755945e-03 -3.74006742e-01]\n"," ...\n"," [ 1.37303378e+00 -4.58716878e+00 5.32079916e+00 ... -5.27364836e+00\n"," -4.61929276e-01 -3.99477199e-01]\n"," [-7.71108477e-01 3.70351276e-01 -4.21082864e-01 ... -8.92925094e-02\n"," 3.05914764e-01 -3.09625392e-01]\n"," [-7.65856624e-01 6.92627420e-01 -5.31871199e-01 ... 
-1.04273833e-01\n"," -2.37201696e-02 -3.41442721e-01]] \n","\n","(900, 30) \n"," (630, 30) \n"," (270, 30)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"wDpKcZ766ADk"},"source":["Ως ταξινομητή σε αυτό το Notebook θα χρησιμοποιήσουμε ένα Random Forest Classifier ![](https://www.researchgate.net/profile/Erika_Amstalden_van_Hove/publication/228540194/figure/fig1/AS:301839783350274@1448975620988/The-Random-Forest-classifier-is-an-ensemble-of-decision-trees-where-the-single-trees-are.png) \n","[source](https://www.researchgate.net/publication/228540194_Towards_Digital_Staining_using_Imaging_Mass_Spectrometry_and_Random_Forests-Technical_Report)\n","\n","\n","Οι κύριες παράμετροι που χρησιμοποιούνται από ένα Random Forest Classifier είναι: \n","* criterion = η συνάρτηση που χρησιμοποιείται για την αξιολόγηση της ποιότητας ενός διαχωρισμού. \n","* max_depth = μέγιστος επιτρεπόμενος αριθμός επιπέδων σε κάθε δέντρο. \n","* max_features = μέγιστος αριθμός χαρακτηριστικών που λαμβάνονται υπόψη κατά τον διαχωρισμό ενός κόμβου. \n","* min_samples_leaf = ελάχιστος αριθμός δειγμάτων που μπορούν να αποθηκευτούν σε ένα φύλλο δέντρου.\n","* min_samples_split = ελάχιστος αριθμός δειγμάτων που είναι απαραίτητα σε έναν κόμβο για να προκαλέσει διαχωρισμό κόμβου. \n","* n_estimators = αριθμός δέντρων στο σύνολο.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"zqKVtXYQxl1E"},"source":["## Χειροκίνητη αναζήτηση (Manual Search) \n","\n","Κατά τη χρήση της χειροκίνητης αναζήτησης, επιλέγουμε ορισμένες τιμές υπερπαραμέτρων με βάση την κρίση/εμπειρία μας. \n","\n","Στη συνέχεια εκπαιδεύουμε το μοντέλο, αξιολογούμε την ακρίβειά του και αρχίζουμε ξανά τη διαδικασία. Αυτός ο βρόχος επαναλαμβάνεται έως ότου σημειωθεί ικανοποιητική ακρίβεια. 
\n","\n"]},{"cell_type":"code","metadata":{"id":"Uii4_c6Yuh6o","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997749398,"user_tz":-120,"elapsed":12871,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"921f01be-d34f-4b18-f177-71ecd870a425"},"source":["model = RandomForestClassifier(n_estimators=5, random_state= 101).fit(X_Train,Y_Train)\n","predictionforest = model.predict(X_Test)\n","\n","print(confusion_matrix(Y_Test,predictionforest))\n","print(classification_report(Y_Test,predictionforest))\n","acc1 = accuracy_score(Y_Test,predictionforest)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[130 0]\n"," [ 1 139]]\n"," precision recall f1-score support\n","\n"," 0 0.99 1.00 1.00 130\n"," 1 1.00 0.99 1.00 140\n","\n"," accuracy 1.00 270\n"," macro avg 1.00 1.00 1.00 270\n","weighted avg 1.00 1.00 1.00 270\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"nyNgze7Q6lA4","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997749398,"user_tz":-120,"elapsed":12865,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"597f5fd3-7110-4806-e112-8e728cdcc58c"},"source":["print(Y_Test)\n","print(predictionforest)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["184 1\n","532 0\n","501 0\n","517 0\n","871 0\n"," ..\n","416 1\n","196 1\n","83 1\n","258 1\n","432 1\n","Name: Class, Length: 270, dtype: int64\n","[1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1\n"," 0 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1\n"," 0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0\n"," 0 0 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 1 1\n"," 0 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 0 0 1\n"," 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 
1 0 1 0 0 1 0 0 1\n"," 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1\n"," 0 0 0 0 0 1 1 1 1 0 1]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"s6dWJ8HXmkNq"},"source":["Λαμβάνοντας υπόψη μια πραγματική ετικέτα και μια προβλεπόμενη ετικέτα, το πρώτο πράγμα που μπορούμε να κάνουμε είναι να χωρίσουμε τα δείγματά μας σε 4 κατηγορίες:\n","* True negative — actual = 0, predicted = 0\n","* False positive — actual = 0, predicted = 1\n","* False negative — actual = 1, predicted = 0\n","* True positive — actual = 1, predicted = 1\n","\n","\n","![](https://drive.google.com/uc?id=1gJ5e8GUiwcRwfiVuTLiGkM6bc92Iz2vW)\n","\n","Μπορούμε να υπολογίσουμε τον πίνακα σύγχυσης (confusion matrix) από το scikit-learn, ο οποίος λαμβάνει ως εισόδους τις πραγματικές ετικέτες και τις προβλεπόμενες ετικέτες (confusion_matrix (df.actual_label.values, df.predicted_RF.values)).\n","\n","Στο παράδειγμά μας True negative =130, False positive = 0, False negative = 1 True positive =139\n","\n","\n","Ας αλλάξουμε τις παραμέτρους στο ταξινομητή και να ελέγξουμε την ακρίβεια του μοντέλου μας:"]},{"cell_type":"code","metadata":{"id":"mvezcDf6unQJ","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997749399,"user_tz":-120,"elapsed":12859,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"9d9299e5-181c-4e78-8584-565e112d8756"},"source":["model = RandomForestClassifier(n_estimators= 200, max_features = \"log2\", min_samples_leaf = 30, random_state= 101).fit(X_Train,Y_Train)\n","predictionforest = model.predict(X_Test)\n","\n","print(confusion_matrix(Y_Test,predictionforest))\n","print(classification_report(Y_Test,predictionforest))\n","acc2 = accuracy_score(Y_Test,predictionforest)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[130 0]\n"," [ 5 135]]\n"," precision recall f1-score support\n","\n"," 0 0.96 1.00 0.98 130\n"," 1 1.00 0.96 0.98 
140\n","\n"," accuracy 0.98 270\n"," macro avg 0.98 0.98 0.98 270\n","weighted avg 0.98 0.98 0.98 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"i750JMqDxTCZ"},"source":["Παρατηρούμε τις τιμές του πίνακα σύγχυσης μετά τις αλλαγές στις παραμέτρους του ταξινομητή. Έχουν εντοπιστεί 5 δείγματα False Negative (έναντι 1 πριν από την αλλαγή). "]},{"cell_type":"markdown","metadata":{"id":"TOXD1052ytqo"},"source":["## Τυχαία Αναζήτηση (Random Search)\n","\n","Στην τυχαία αναζήτηση, δημιουργούμε ένα πλέγμα υπερπαραμέτρων και εκπαιδεύουμε / δοκιμάζουμε το μοντέλο μας σε τυχαίους συνδυασμούς αυτών των υπερπαραμέτρων.\n","\n","Το RandomizedSearchCV υλοποιεί τις μεθόδους fit («προσαρμογή») και predict («πρόβλεψη»), βελτιστοποιώντας τις παραμέτρους του ταξινομητή που χρησιμοποιείται για την πρόβλεψη μέσω διασταυρούμενης επικύρωσης. "]},{"cell_type":"code","metadata":{"id":"xdMVOnr4utYh"},"source":["from sklearn.model_selection import RandomizedSearchCV\n","\n","random_search = {\n"," 'max_features': ['auto', 'sqrt','log2', None],\n"," 'n_estimators': list(np.linspace(1, 250, 120, dtype = int))\n"," }"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"pId6q4-kuwIA","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997818358,"user_tz":-120,"elapsed":81809,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"8c4153ae-8d52-4a15-d748-d2d2da53f0ee"},"source":["clf = RandomForestClassifier()\n","model = RandomizedSearchCV(estimator = clf, param_distributions = random_search, n_iter = 80, \n"," cv = 4, verbose= 5, random_state= 101, n_jobs = -1) \n"," \n"," #verbose:Controls the verbosity: the higher, the more messages, \n"," #n_jobs:int, default=1 ,Number of jobs to run in parallel. 
\n"," # η παράμετρος n_jobs = -1 χρησιμοποιεί όλους τους πυρήνες του υπολογιστή\n","\n","model.fit(X_Train,Y_Train)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Fitting 4 folds for each of 80 candidates, totalling 320 fits\n"],"name":"stdout"},{"output_type":"stream","text":["[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n","[Parallel(n_jobs=-1)]: Done 14 tasks | elapsed: 4.0s\n","[Parallel(n_jobs=-1)]: Done 68 tasks | elapsed: 14.7s\n","[Parallel(n_jobs=-1)]: Done 158 tasks | elapsed: 32.0s\n","[Parallel(n_jobs=-1)]: Done 284 tasks | elapsed: 59.0s\n","[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 1.1min finished\n"],"name":"stderr"},{"output_type":"execute_result","data":{"text/plain":["RandomizedSearchCV(cv=4, error_score=nan,\n"," estimator=RandomForestClassifier(bootstrap=True,\n"," ccp_alpha=0.0,\n"," class_weight=None,\n"," criterion='gini',\n"," max_depth=None,\n"," max_features='auto',\n"," max_leaf_nodes=None,\n"," max_samples=None,\n"," min_impurity_decrease=0.0,\n"," min_impurity_split=None,\n"," min_samples_leaf=1,\n"," min_samples_split=2,\n"," min_weight_fraction_leaf=0.0,\n"," n_estimators=100,\n"," n_jobs...\n"," random_state=None,\n"," verbose=0,\n"," warm_start=False),\n"," iid='deprecated', n_iter=80, n_jobs=-1,\n"," param_distributions={'max_features': ['auto', 'sqrt', 'log2',\n"," None],\n"," 'n_estimators': [1, 3, 5, 7, 9, 11, 13,\n"," 15, 17, 19, 21, 24, 26,\n"," 28, 30, 32, 34, 36, 38,\n"," 40, 42, 44, 47, 49, 51,\n"," 53, 55, 57, 59, 61, ...]},\n"," pre_dispatch='2*n_jobs', random_state=101, refit=True,\n"," return_train_score=False, scoring=None, verbose=5)"]},"metadata":{"tags":[]},"execution_count":13}]},{"cell_type":"code","metadata":{"id":"QKGvgxnzZqxu","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997818360,"user_tz":-120,"elapsed":81805,"user":{"displayName":"Parask 
Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"9d890ac9-d251-424b-a13d-102588aa219e"},"source":["preds = model.predict(X_Test)\n","\n","print(confusion_matrix(Y_Test,preds))\n","print(classification_report(Y_Test,preds))\n","acc3 = accuracy_score(Y_Test,preds)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[130 0]\n"," [ 0 140]]\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 130\n"," 1 1.00 1.00 1.00 140\n","\n"," accuracy 1.00 270\n"," macro avg 1.00 1.00 1.00 270\n","weighted avg 1.00 1.00 1.00 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"KDKT4wIOJdCJ"},"source":["## Αναζήτηση πλέγματος (Grid Search)"]},{"cell_type":"markdown","metadata":{"id":"LkBQboyeJKMc"},"source":["Η απόδοση όλων των πιθανών συνδυασμών υπερ-παραμέτρων μετασχηματιστών και εκτιμητή γίνεται με αναζήτηση πλέγματος (grid search). Ορίζουμε για κάθε παράμετρο ένα πεδίο ορισμού, συνήθως με ελάχιστο, μέγιστο και κάποιο βήμα και φτιάχνουμε ένα πλέγμα με όλους τους πιθανούς συνδυασμούς τιμών των παραμέτρων. \n","\n","Για κύριες συνιστώσες PCA (transformer) από 5 μέχρι 15 με βήμα 5 και για έναν RFC (estimator) με n_estimators δηλαδή ο αριθμός δέντρων στο σύνολο από 1 μέχρι 5 με βήμα 2 παίρνουμε το ακόλουθο grid:\n","\n","| | | | |\n","|-----|-------|--------|--------|\n","| | PC=5 | PC=10 | PC=15 |\n","| n=1 | (1,5) | (1,10) | (1,15) |\n","| n=3 | (3,5) | (3,10) | (3,15) |\n","| n=5 | (5,5) | (5,10) | (5,15) |\n","\n","Για κάθε τιμή υπερπαραμέτρων του grid θα πρέπει να υπολογιστεί ο μέσος όρος του εκτιμητή σε όλα τα folds του cross-validation με βάση το metric (πχ F1) και να επιλεχθεί ο καλύτερος συνδυασμός παραμέτρων. Η συγκεκριμένη στρατηγική αναζήτησης των βέλτιστων υπερπαραμέτρων είναι η εξαντλητική αναζήτηση πλέγματος (exhaustive grid search) και είναι προφανώς πολύ ακριβή υπολογιστικά. 
Υπάρχουν διάφορες τεχνικές για να περιορίζεται η πολυπλοκότητα του grid search, αλλά δεν το αποφεύγουμε γενικά, γιατί οι υπερπαράμετροι είναι ορίσματα των εκτιμητών και δεν μαθαίνονται από την fit.\n","\n","Συνοψίζοντας, η βελτιστοποίηση των υπερπαραμέτρων απαιτεί\n","\n","* έναν εκτιμητή (έναν ταξινομητή)\n","* τον πεδίο ορισμού των υπερπαραμέτρων\n","* ένα τρόπο αναζήτησης των πιθανών συνδυασμών τιμών τους πχ grid search\n","* ένα σχήμα cross-validation πχ 5-fold\n","* μια μετρική απόδοσης (ή score) πχ F1-macro\n","\n","Το scikit-learn μας απλοποιεί σε πολύ μεγάλο βαθμό την κατασκευή pipelines και τη βελτιστοποίηση των υπερπαραμέτρων.\n","\n","\n","Στη συνέχεια θα χρησιμοποιήσουμε την [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) για να βελτιστοποιήσουμε τις υπερπαραμέτρους μας. Η GridSearchCV κάνει μαζί cross-validation και grid search. "]},{"cell_type":"markdown","metadata":{"id":"sWsZXzCI0BfT"},"source":[" Την εισάγουμε και θέτουμε τις τιμές ορισμού των υπερπαραμέτρων:"]},{"cell_type":"code","metadata":{"id":"cPkbvAyTwjdN","colab":{"base_uri":"https://localhost:8080/","height":442},"executionInfo":{"status":"ok","timestamp":1604997826025,"user_tz":-120,"elapsed":89462,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"9d7b78a7-8ca5-49f6-e6e4-aaf9a174efc5"},"source":["!pip install --upgrade imbalanced-learn"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Collecting imbalanced-learn\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/c8/81/8db4d87b03b998fda7c6f835d807c9ae4e3b141f978597b8d7f31600be15/imbalanced_learn-0.7.0-py3-none-any.whl (167kB)\n","\r\u001b[K |██ | 10kB 17.0MB/s eta 0:00:01\r\u001b[K |████ | 20kB 1.6MB/s eta 0:00:01\r\u001b[K |█████▉ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |███████▉ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |█████████▉ | 51kB 1.9MB/s eta 0:00:01\r\u001b[K |███████████▊ | 61kB 2.2MB/s eta 
0:00:01\r\u001b[K |█████████████▊ | 71kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 81kB 2.7MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 92kB 2.9MB/s eta 0:00:01\r\u001b[K |███████████████████▋ | 102kB 2.7MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 112kB 2.7MB/s eta 0:00:01\r\u001b[K |███████████████████████▌ | 122kB 2.7MB/s eta 0:00:01\r\u001b[K |█████████████████████████▌ | 133kB 2.7MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 143kB 2.7MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 153kB 2.7MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▍| 163kB 2.7MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 174kB 2.7MB/s \n","\u001b[?25hRequirement already satisfied, skipping upgrade: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from imbalanced-learn) (0.17.0)\n","Requirement already satisfied, skipping upgrade: scipy>=0.19.1 in /usr/local/lib/python3.6/dist-packages (from imbalanced-learn) (1.4.1)\n","Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from imbalanced-learn) (1.18.5)\n","Collecting scikit-learn>=0.23\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/5c/a1/273def87037a7fb010512bbc5901c31cfddfca8080bc63b42b26e3cc55b3/scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)\n","\u001b[K |████████████████████████████████| 6.8MB 8.5MB/s \n","\u001b[?25hCollecting threadpoolctl>=2.0.0\n"," Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl\n","Installing collected packages: threadpoolctl, scikit-learn, imbalanced-learn\n"," Found existing installation: scikit-learn 0.22.2.post1\n"," Uninstalling scikit-learn-0.22.2.post1:\n"," Successfully uninstalled scikit-learn-0.22.2.post1\n"," Found existing installation: imbalanced-learn 0.4.3\n"," Uninstalling imbalanced-learn-0.4.3:\n"," Successfully 
uninstalled imbalanced-learn-0.4.3\n","Successfully installed imbalanced-learn-0.7.0 scikit-learn-0.23.2 threadpoolctl-2.1.0\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.colab-display-data+json":{"pip_warning":{"packages":["sklearn"]}}},"metadata":{"tags":[]}}]},{"cell_type":"code","metadata":{"id":"JrLrdMceJKMO"},"source":["#from imblearn.pipeline import Pipeline\n","from imblearn.pipeline import Pipeline\n","\n","# bring in the preprocessing classes we already know\n","from sklearn.decomposition import PCA\n","\n","# instantiate the estimators (transformer and classifier) without tuned hyperparameters\n","pca = PCA()\n","rfc=RandomForestClassifier(random_state=42)\n","\n","# chain PCA with the seeded rfc defined above (not the unseeded clf from the random-search cell)\n","pipe = Pipeline(steps=[('pca', pca), ('RFC', rfc)])"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Ypk9FqwaJKMT"},"source":["Το pipeline συμπεριφέρεται ως ένας ενιαίος estimator. Μπορούμε να εφαρμόσουμε fit και predict."]},{"cell_type":"code","metadata":{"id":"TahYufqoJKMU","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997826427,"user_tz":-120,"elapsed":89854,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"c61e67ea-d53e-4bf3-8128-22a7e13d4bae"},"source":["pipe.fit(X_Train,Y_Train)\n","preds = pipe.predict(X_Test)\n","print(confusion_matrix(Y_Test,preds))\n","print(classification_report(Y_Test, preds))\n","acc4 = accuracy_score(Y_Test,preds)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[129 1]\n"," [ 5 135]]\n"," precision recall f1-score support\n","\n"," 0 0.96 0.99 0.98 130\n"," 1 0.99 0.96 0.98 140\n","\n"," accuracy 0.98 270\n"," macro avg 0.98 0.98 0.98 270\n","weighted avg 0.98 0.98 0.98 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"pk6d0-gyUYK4"},"source":["Επειδή ο χώρος αναζήτησης των βέλτιστων υπερπαραμέτρων αρχίζει να μεγαλώνει, ξαναορίζουμε την pipeline με την παράμετρο 'memory': 
για κάθε fold του crossvalidation και για καθε συνδυασμό υπερπαραμέτρων μετασχηματιστών, τα δεδομένα χρειάζεται να μετασχηματιστούν μία φορά και όχι για κάθε νέα τιμή υπερπαραμέτρων του εκτιμητή. \n","\n","Είναι πιθανό στο fit να σας εμφανιστούν κάποια warnings με τη χρήση του memory. Ξανατρέξτε το block του κώδικα."]},{"cell_type":"markdown","metadata":{"id":"j4V7PsWgUYK7"},"source":["Μπορούμε να θέτουμε τιμές στις υπερπαραμέτρους των pipelines χρησιμοποιώντας τα ονόματα των estimators, \"\\_\\_\", το όνομα της υπερπαραμέτρου, \"=\" και τις τιμές που της δίνουμε στο grid search. Επίσης μπορούμε να θέσουμε τη μετρική της απόδοσης με την παράμετρο \"scoring\". Με την παράμετρο \"cv\" ορίζουμε τον αριθμό των folds. Για βελτιστοποίηση, μπορούμε να θέσουμε την παράμετρο n_jobs=-1 ώστε να χρησιμοποιούνται όλοι οι πυρήνες του υπολογιστή (το default είναι 1)."]},{"cell_type":"code","metadata":{"id":"4vcraWV3UYK7"},"source":["#from imblearn.pipeline import Pipeline\n","from imblearn.pipeline import Pipeline\n","\n","# φέρνουμε τις γνωστές μας κλάσεις για preprocessing\n","from sklearn.decomposition import PCA\n","\n","# αρχικοποιούμε τους εκτιμητές (μετασχηματιστές και ταξινομητή) χωρίς παραμέτρους\n","pca = PCA()\n","rfc=RandomForestClassifier(random_state=42)\n","pipe = Pipeline(steps=[('pca', pca), ('RFC', rfc)], memory = 'tmp')\n","\n","# Parameters of pipelines can be set using ‘__’ separated parameter names:\n","param_grid = { \n"," 'pca__n_components': [5, 10, 15,20,25],\n"," 'RFC__max_features': ['auto', 'sqrt', 'log2', None], \n"," 'RFC__n_estimators': [1, 3, 5, 7, 9]\n"," }\n"," "],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"m7SUqok2gtu4"},"source":["from sklearn.model_selection import GridSearchCV\n","estimator = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"2xNG55RzUYK9"},"source":["Το GridSearchCV είναι επίσης ένας estimator με 
fit και predict. Ανάλογα το search space η εκτέλεση του GridSearchCV μπορεί να πάρει αρκετό χρόνο."]},{"cell_type":"code","metadata":{"id":"43lW-tGjUYK-","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997834432,"user_tz":-120,"elapsed":97849,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"050b08e4-54e1-4e01-b595-bbf4bf64e75a"},"source":["import time\n","start_time = time.time()\n","estimator.fit(X_Train, Y_Train)\n","preds = estimator.predict(X_Test)\n","print(\"Συνολικός χρόνος fit και predict: %s seconds\" % (time.time() - start_time))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Συνολικός χρόνος fit και predict: 7.946547985076904 seconds\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"Cscsqa-PUYLA"},"source":["Tυπώνουμε τον καλύτερο estimator και τον καλύτερο συνδυασμό υπερπαραμέτρων:"]},{"cell_type":"code","metadata":{"id":"j5_Vq5tlUYLA","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997834433,"user_tz":-120,"elapsed":97844,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"61dcae57-e16f-40b7-c9a3-181d1b73375f"},"source":["print(estimator.best_estimator_)\n","print(estimator.best_params_)\n","\n","print(confusion_matrix(Y_Test,preds))\n","print(classification_report(Y_Test,preds))\n","acc5=accuracy_score(Y_Test,preds)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Pipeline(memory='tmp',\n"," steps=[('pca',\n"," PCA(copy=True, iterated_power='auto', n_components=5,\n"," random_state=None, svd_solver='auto', tol=0.0,\n"," whiten=False)),\n"," ('RFC',\n"," RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n"," class_weight=None, criterion='gini',\n"," max_depth=None, max_features='auto',\n"," max_leaf_nodes=None, max_samples=None,\n"," min_impurity_decrease=0.0,\n"," min_impurity_split=None,\n"," min_samples_leaf=1, 
min_samples_split=2,\n"," min_weight_fraction_leaf=0.0,\n"," n_estimators=9, n_jobs=None,\n"," oob_score=False, random_state=42,\n"," verbose=0, warm_start=False))],\n"," verbose=False)\n","{'RFC__max_features': 'auto', 'RFC__n_estimators': 9, 'pca__n_components': 5}\n","[[129 1]\n"," [ 4 136]]\n"," precision recall f1-score support\n","\n"," 0 0.97 0.99 0.98 130\n"," 1 0.99 0.97 0.98 140\n","\n"," accuracy 0.98 270\n"," macro avg 0.98 0.98 0.98 270\n","weighted avg 0.98 0.98 0.98 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"jkddnBzAUYLC"},"source":["Η στοχαστικότητα στη διαδικασία της ταξινόμησης οφείλεται στον διαφορετικό διαχωρισμό σε folds σε κάθε επανάληψη. "]},{"cell_type":"markdown","metadata":{"id":"PX3ZkL7OJKNV"},"source":["## Progressive grid search\n","\n","Στο πεδίο ορισμού των παραμέτρων, ξεκινάμε με μεγάλα διαστήματα και σχετικά λίγα βήματα. Αν διαπιστώσουμε ότι υπάρχει μια περιοχή τιμών κάποιας παραμέτρου που δίνει καλή απόδοση μπορούμε να μικρύνουμε το διάστημα του grid search γύρω της και να βάλουμε περισσότερα βήματα."]},{"cell_type":"code","metadata":{"scrolled":true,"id":"pu8PlyLzJKNW","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997835889,"user_tz":-120,"elapsed":99293,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"f2495add-3abd-4dd7-bec6-7889c3b20ca4"},"source":["#from imblearn.pipeline import Pipeline\n","from imblearn.pipeline import Pipeline\n","\n","# φέρνουμε τις γνωστές μας κλάσεις για preprocessing\n","from sklearn.decomposition import PCA\n","\n","n_components = [15, 17]\n","max_features= ['log2', None] \n","n_estimators= [7, 8]\n","\n","pca=PCA()\n","rfc=RandomForestClassifier(random_state=42)\n","pipe = Pipeline(steps=[('pca', pca), ('RFC', rfc)], memory = 'tmp')\n","\n","\n","estimator = GridSearchCV(pipe, dict(pca__n_components=n_components, RFC__max_features=max_features, RFC__n_estimators=n_estimators), 
scoring='f1_macro', n_jobs=-1)\n","estimator.fit(X_Train, Y_Train)\n","preds = estimator.predict(X_Test)\n","\n","print(classification_report(Y_Test, preds))\n","\n","\n","print(estimator.best_estimator_)\n","print(estimator.best_params_)"],"execution_count":null,"outputs":[{"output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.93 0.99 0.96 130\n"," 1 0.99 0.93 0.96 140\n","\n"," accuracy 0.96 270\n"," macro avg 0.96 0.96 0.96 270\n","weighted avg 0.96 0.96 0.96 270\n","\n","Pipeline(memory='tmp',\n"," steps=[('pca',\n"," PCA(copy=True, iterated_power='auto', n_components=17,\n"," random_state=None, svd_solver='auto', tol=0.0,\n"," whiten=False)),\n"," ('RFC',\n"," RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n"," class_weight=None, criterion='gini',\n"," max_depth=None, max_features=None,\n"," max_leaf_nodes=None, max_samples=None,\n"," min_impurity_decrease=0.0,\n"," min_impurity_split=None,\n"," min_samples_leaf=1, min_samples_split=2,\n"," min_weight_fraction_leaf=0.0,\n"," n_estimators=7, n_jobs=None,\n"," oob_score=False, random_state=42,\n"," verbose=0, warm_start=False))],\n"," verbose=False)\n","{'RFC__max_features': None, 'RFC__n_estimators': 7, 'pca__n_components': 17}\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"5LPjl9gD4vuY","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997835889,"user_tz":-120,"elapsed":99286,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"2dfefd18-77cf-4e3a-e3fe-ea3a6bcb5363"},"source":["print(confusion_matrix(Y_Test,preds))\n","print(classification_report(Y_Test, preds))\n","acc6 = accuracy_score(Y_Test,preds)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[129 1]\n"," [ 10 130]]\n"," precision recall f1-score support\n","\n"," 0 0.93 0.99 0.96 130\n"," 1 0.99 0.93 0.96 140\n","\n"," accuracy 0.96 270\n"," macro avg 0.96 0.96 0.96 270\n","weighted avg 0.96 0.96 0.96 
270\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"MEJaE01V7-1U","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604997835890,"user_tz":-120,"elapsed":99281,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"7b66cb5f-5d65-42e8-9531-c250e787c98a"},"source":["print(estimator.best_estimator_)\n","print(estimator.best_params_)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Pipeline(memory='tmp',\n"," steps=[('pca',\n"," PCA(copy=True, iterated_power='auto', n_components=17,\n"," random_state=None, svd_solver='auto', tol=0.0,\n"," whiten=False)),\n"," ('RFC',\n"," RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n"," class_weight=None, criterion='gini',\n"," max_depth=None, max_features=None,\n"," max_leaf_nodes=None, max_samples=None,\n"," min_impurity_decrease=0.0,\n"," min_impurity_split=None,\n"," min_samples_leaf=1, min_samples_split=2,\n"," min_weight_fraction_leaf=0.0,\n"," n_estimators=7, n_jobs=None,\n"," oob_score=False, random_state=42,\n"," verbose=0, warm_start=False))],\n"," verbose=False)\n","{'RFC__max_features': None, 'RFC__n_estimators': 7, 'pca__n_components': 17}\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"KvSjUCRYJKNd"},"source":["Το περισσότερο fine grained grid search, αν δώσει καλύτερες τιμές θα έχει βελτιστοποιήσει τον εκτιμητή, αν όχι, τουλάχιστον θα επιβεβαιώσει ότι είμαστε σε ένα καλό τοπικό μέγιστο της συνάρτησης αξιολόγησης.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"O_-nBtTQ_Z7u"},"source":["## Αυτόματη ρύθμιση υπερπαραμέτρων (Automated Hyperparameter Tuning)\n","\n","Για την αυτόματη ρύθμιση υπερπαραμέτρων χρησιμοποιούμε τεχνικές όπως: Bayesian Optimization, Gradient Descent και Evolutionary Algorithms.\n","\n","### Βελτιστοποίηση Bayesian (Bayesian Optimization)\n","\n","Η Bayesian Optimization μπορεί να πραγματοποιηθεί στην Python χρησιμοποιώντας τη βιβλιοθήκη Hyperopt. 
Η βελτιστοποίηση Bayesian βασίζεται σε πιθανότητες για να βρει το ελάχιστο μιας συνάρτησης. Ο τελικός στόχος είναι να βρούμε την τιμή εισόδου σε μια συνάρτηση που μπορεί να μας δώσει τη χαμηλότερη δυνατή τιμή εξόδου. \n","\n","Η βελτιστοποίηση Bayesian έχει αποδειχθεί πιο αποτελεσματική από την τυχαία αναζήτηση, την αναζήτηση πλέγματος ή τη χειροκίνητη αναζήτηση. Η Bayesian Optimization μπορεί, ως εκ τούτου, να οδηγήσει σε καλύτερη απόδοση στη φάση δοκιμών και σε μειωμένο χρόνο βελτιστοποίησης. \n","\n","Στο Hyperopt, η Bayesian Optimization μπορεί να εφαρμοστεί δίνοντας τρεις κύριες παραμέτρους στη συνάρτηση fmin():\n","* **Objective Function**: καθορίζει τη συνάρτηση απώλειας για ελαχιστοποίηση \n","* **Domain Space**: καθορίζει το εύρος των τιμών εισόδου που θα δοκιμαστούν (στη βελτιστοποίηση Bayesian αυτός ο χώρος δημιουργεί μια κατανομή πιθανότητας για καθεμία από τις χρησιμοποιούμενες υπερπαραμέτρους).\n","* **Optimization Algorithm**: καθορίζει τη συνάρτηση αναζήτησης που θα επιλέξει τις καλύτερες τιμές εισόδου για χρήση σε κάθε νέα επανάληψη.\n","\n","Επιπλέον, μπορεί επίσης να οριστεί στην fmin ο μέγιστος αριθμός αξιολογήσεων που θα εκτελεστούν. \n","\n","Η Bayesian Optimization μπορεί να μειώσει τον αριθμό των επαναλήψεων αναζήτησης επιλέγοντας τις τιμές εισόδου λαμβάνοντας υπόψη τα προηγούμενα αποτελέσματα. Με αυτόν τον τρόπο, μπορούμε να επικεντρώσουμε την αναζήτησή μας από την αρχή σε τιμές που είναι πιο κοντά στην επιθυμητή έξοδο. 
"]},{"cell_type":"code","metadata":{"id":"m7KudyTs_Zgv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604998064833,"user_tz":-120,"elapsed":328218,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"ce5aac81-1bb6-495b-92d1-b0d6a95314d4"},"source":["from hyperopt import hp, fmin, tpe, STATUS_OK, Trials\n","\n","space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),\n"," 'max_depth': hp.quniform('max_depth', 10, 1200, 10),\n"," 'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),\n"," 'min_samples_leaf': hp.uniform ('min_samples_leaf', 0, 0.5),\n"," 'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),\n"," 'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200])\n"," }\n","\n","def objective(space):\n"," model = RandomForestClassifier(criterion = space['criterion'], \n"," max_depth = space['max_depth'],\n"," max_features = space['max_features'],\n"," min_samples_leaf = space['min_samples_leaf'],\n"," min_samples_split = space['min_samples_split'],\n"," n_estimators = space['n_estimators'], \n"," )\n"," \n"," accuracy = cross_val_score(model, X_Train, Y_Train, cv = 4).mean()\n","\n"," # We aim to maximize accuracy, therefore we return it as a negative value\n"," return {'loss': -accuracy, 'status': STATUS_OK }\n"," \n","trials = Trials()\n","best = fmin(fn= objective,\n"," space= space,\n"," algo= tpe.suggest,\n"," max_evals = 80,\n"," trials= trials)\n","best\n"],"execution_count":null,"outputs":[{"output_type":"stream","text":["100%|██████████| 80/80 [03:48<00:00, 2.86s/it, best loss: -1.0]\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["{'criterion': 0,\n"," 'max_depth': 1190.0,\n"," 'max_features': 3,\n"," 'min_samples_leaf': 0.20684473576260715,\n"," 'min_samples_split': 0.6427300415396294,\n"," 'n_estimators': 
2}"]},"metadata":{"tags":[]},"execution_count":25}]},{"cell_type":"markdown","metadata":{"id":"eVyBMTlj_kHu"},"source":["Μπορούμε να ανακτήσουμε το σύνολο των καλύτερων παραμέτρων που προσδιορίστηκαν και να δοκιμάσουμε το μοντέλο μας χρησιμοποιώντας το καλύτερο \"λεξικό\" που δημιουργήθηκε κατά τη διάρκεια της εκπαίδευσης. Μερικές από τις παραμέτρους έχουν αποθηκευτεί στο καλύτερο \"λεξικό\" αριθμητικά χρησιμοποιώντας δείκτες, επομένως, πρέπει πρώτα να τις μετατρέψουμε ως συμβολοσειρές πριν τις εισαγάγουμε στο Random Forest μας."]},{"cell_type":"code","metadata":{"id":"hthis__b_lOU","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604998065200,"user_tz":-120,"elapsed":328579,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"9c4a3eaf-a787-48c6-fece-06a6b71e58d6"},"source":["crit = {0: 'entropy', 1: 'gini'}\n","feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}\n","est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200}\n","\n","trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], \n"," max_depth = best['max_depth'], \n"," max_features = feat[best['max_features']], \n"," min_samples_leaf = best['min_samples_leaf'], \n"," min_samples_split = best['min_samples_split'], \n"," n_estimators = est[best['n_estimators']]\n"," ).fit(X_Train,Y_Train)\n","predictionforest = trainedforest.predict(X_Test)\n","print(confusion_matrix(Y_Test,predictionforest))\n","print(classification_report(Y_Test,predictionforest))\n","acc6 = accuracy_score(Y_Test,predictionforest)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[130 0]\n"," [ 0 140]]\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 130\n"," 1 1.00 1.00 1.00 140\n","\n"," accuracy 1.00 270\n"," macro avg 1.00 1.00 1.00 270\n","weighted avg 1.00 1.00 1.00 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"5vBc1NY7_r20"},"source":["## Γενετικοί Αλγόριθμοι (Genetic 
Algorithms)\n","Οι Γενετικοί Αλγόριθμοι προσπαθούν να εφαρμόσουν φυσικούς μηχανισμούς επιλογής σε περιβάλλοντα Μηχανικής Μάθησης. Εμπνέονται από τη διαδικασία της Φυσικής Επιλογής και επομένως ονομάζονται επίσης ως Εξελικτικοί Αλγόριθμοι.\n","\n","Ας υποθέσουμε ότι δημιουργούμε έναν πληθυσμό N μοντέλων Μηχανικής Μάθησης με ορισμένες προκαθορισμένες υπερπαραμέτρους.\n","\n","Στη συνέχεια μπορούμε να υπολογίσουμε την ακρίβεια κάθε μοντέλου και να αποφασίσουμε να διατηρήσουμε μόνο τα μισά από τα μοντέλα (αυτά που έχουν την καλύτερη απόδοση). \n","\n","Μπορούμε τώρα να δημιουργήσουμε μερικούς απογόνους που έχουν παρόμοιες υπερπαραμέτρους με αυτές των καλύτερων μοντέλων, ώστε να πάρουμε ξανά έναν πληθυσμό Ν μοντέλων. \n","\n","Σε αυτό το σημείο, μπορούμε να υπολογίσουμε ξανά την ακρίβεια κάθε μοντέλου και να επαναλάβουμε τον κύκλο για έναν καθορισμένο αριθμό γενεών. Με αυτόν τον τρόπο, μόνο τα καλύτερα μοντέλα θα επιβιώσουν στο τέλος της διαδικασίας.\n","\n"," Για να εφαρμόσουμε τους Γενετικούς Αλγόριθμους στην Python, μπορούμε να χρησιμοποιήσουμε τη βιβλιοθήκη TPOT Auto Machine Learning. 
Το TPOT είναι χτισμένο στη βιβλιοθήκη scikit-learn και μπορεί να χρησιμοποιηθεί είτε για εργασίες παλινδρόμησης είτε για ταξινόμηση."]},{"cell_type":"code","metadata":{"id":"i7kNOKHd_sDb","colab":{"base_uri":"https://localhost:8080/","height":1000,"referenced_widgets":["eff23e86f29b458090212600b5e63563","f5eea29c8af148e6b9ecf585885aa0b2","2134bca9c6f742089a253f7fb58525e8","f9de162dc6e14139b177c93fac3a0b01","74ad646f5e2a48eb9d3d0e363b71d7ed","7730fde597a24584acc0d1668ae9ba71","7f435f16c67545b384a74c29c02de8ea","2ce11d9acb4d42fd82c6215b0fc429da"]},"executionInfo":{"status":"ok","timestamp":1604998860294,"user_tz":-120,"elapsed":1123666,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"1ba9dd6c-a3d8-4ecb-de4f-217a74952f1d"},"source":["!pip install tpot\n"," \n","from tpot import TPOTClassifier\n","\n","parameters = {'criterion': ['entropy', 'gini'],\n"," 'max_depth': list(np.linspace(10, 1200, 10, dtype = int)) + [None],\n"," 'max_features': ['auto', 'sqrt','log2', None],\n"," 'min_samples_leaf': [4, 12],\n"," 'min_samples_split': [5, 10],\n"," 'n_estimators': list(np.linspace(151, 1200, 10, dtype = int))}\n"," \n","tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,\n"," verbosity= 2, early_stop= 12,\n"," config_dict=\n"," {'sklearn.ensemble.RandomForestClassifier': parameters}, \n"," cv = 4, scoring = 'accuracy')\n","tpot_classifier.fit(X_Train,Y_Train) "],"execution_count":null,"outputs":[{"output_type":"stream","text":["Collecting tpot\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/18/19/4e61af9cd13340167c7865bd55b29c2605058acb4c0aca438c45db75aa29/TPOT-0.11.6.post1-py3-none-any.whl (86kB)\n","\u001b[K |████████████████████████████████| 92kB 2.2MB/s \n","\u001b[?25hRequirement already satisfied: tqdm>=4.36.1 in /usr/local/lib/python3.6/dist-packages (from tpot) (4.41.1)\n","Requirement already satisfied: pandas>=0.24.2 in /usr/local/lib/python3.6/dist-packages 
(from tpot) (1.1.4)\n","Requirement already satisfied: joblib>=0.13.2 in /usr/local/lib/python3.6/dist-packages (from tpot) (0.17.0)\n","Requirement already satisfied: scikit-learn>=0.22.0 in /usr/local/lib/python3.6/dist-packages (from tpot) (0.23.2)\n","Requirement already satisfied: numpy>=1.16.3 in /usr/local/lib/python3.6/dist-packages (from tpot) (1.18.5)\n","Collecting deap>=1.2\n","\u001b[?25l Downloading https://files.pythonhosted.org/packages/0a/eb/2bd0a32e3ce757fb26264765abbaedd6d4d3640d90219a513aeabd08ee2b/deap-1.3.1-cp36-cp36m-manylinux2010_x86_64.whl (157kB)\n","\u001b[K |████████████████████████████████| 163kB 4.1MB/s \n","\u001b[?25hRequirement already satisfied: scipy>=1.3.1 in /usr/local/lib/python3.6/dist-packages (from tpot) (1.4.1)\n","Collecting update-checker>=0.16\n"," Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl\n","Collecting stopit>=1.1.1\n"," Downloading https://files.pythonhosted.org/packages/35/58/e8bb0b0fb05baf07bbac1450c447d753da65f9701f551dca79823ce15d50/stopit-1.1.2.tar.gz\n","Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.2->tpot) (2018.9)\n","Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.2->tpot) (2.8.1)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.0->tpot) (2.1.0)\n","Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.16->tpot) (2.23.0)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas>=0.24.2->tpot) (1.15.0)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot) 
(1.24.3)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot) (3.0.4)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot) (2020.6.20)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot) (2.10)\n","Building wheels for collected packages: stopit\n"," Building wheel for stopit (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for stopit: filename=stopit-1.1.2-cp36-none-any.whl size=11956 sha256=a3171f2ac3f805dfa2347a65083d5b76d322a019ca940fc3190dbd72b78eae7c\n"," Stored in directory: /root/.cache/pip/wheels/3c/85/2b/2580190404636bfc63e8de3dff629c03bb795021e1983a6cc7\n","Successfully built stopit\n","Installing collected packages: deap, update-checker, stopit, tpot\n","Successfully installed deap-1.3.1 stopit-1.1.2 tpot-0.11.6.post1 update-checker-0.18.0\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"eff23e86f29b458090212600b5e63563","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=84.0, style=ProgressStyle(des…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\r\n","Generation 1 - Current best internal CV score: 1.0\n","\n","Generation 2 - Current best internal CV score: 1.0\n","\n","Generation 3 - Current best internal CV score: 1.0\n","\n","Generation 4 - Current best internal CV score: 1.0\n","\n","Generation 5 - Current best internal CV score: 1.0\n","\n","Best pipeline: RandomForestClassifier(input_matrix, criterion=gini, max_depth=803, max_features=None, min_samples_leaf=4, min_samples_split=10, 
n_estimators=384)\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/plain":["TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',\n"," 'gini'],\n"," 'max_depth': [10,\n"," 142,\n"," 274,\n"," 406,\n"," 538,\n"," 671,\n"," 803,\n"," 935,\n"," 1067,\n"," 1200,\n"," None],\n"," 'max_features': ['auto',\n"," 'sqrt',\n"," 'log2',\n"," None],\n"," 'min_samples_leaf': [4,\n"," 12],\n"," 'min_samples_split': [5,\n"," 10],\n"," 'n_estimators': [151,\n"," 267,\n"," 384,\n"," 500,\n"," 617,\n"," 733,\n"," 850,\n"," 966,\n"," 1083,\n"," 1200]}},\n"," crossover_rate=0.1, cv=4, disable_update_check=False,\n"," early_stop=12, generations=5, log_file=None,\n"," max_eval_time_mins=5, max_time_mins=None, memory=None,\n"," mutation_rate=0.9, n_jobs=1, offspring_size=12,\n"," periodic_checkpoint_folder=None, population_size=24,\n"," random_state=None, scoring='accuracy', subsample=1.0,\n"," template=None, use_dask=False, verbosity=2, warm_start=False)"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"code","metadata":{"id":"lIGh7dQQ_4ga","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604998860298,"user_tz":-120,"elapsed":1123662,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"c9384771-32b5-437f-bf17-484a3e18c633"},"source":["# Predict once with the fitted TPOT classifier and reuse the result for every metric below.\n","preds = tpot_classifier.predict(X_Test)\n","\n","print(classification_report(Y_Test, preds))\n","\n","# tpot_classifier was built with scoring='accuracy', so accuracy_score on the existing\n","# predictions equals tpot_classifier.score(X_Test, Y_Test) without a second inference pass.\n","acc7 = accuracy_score(Y_Test, preds)\n"],"execution_count":null,"outputs":[{"output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 130\n"," 1 1.00 1.00 1.00 140\n","\n"," accuracy 1.00 270\n"," macro avg 1.00 1.00 1.00 270\n","weighted avg 1.00 1.00 1.00 270\n","\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"TkHDDEwo_9km"},"source":["### Artificial Neural Networks (ANNs) Tuning\n","Χρησιμοποιώντας το KerasClassifier wrapper, είναι δυνατή 
η εφαρμογή μοντέλων Grid Search και Random Search για Deep Learning με τον ίδιο τρόπο που χρησιμοποιήθηκε κατά τη χρήση μοντέλων μηχανικής εκμάθησης το scikit-learning. \n","\n","Στο ακόλουθο παράδειγμα, θα προσπαθήσουμε να βελτιστοποιήσουμε ορισμένες από τις παραμέτρους ANN μας, όπως: πόσους νευρώνες να χρησιμοποιήσουμε σε κάθε στρώμα και ποια συνάρτηση ενεργοποίησης και βελτιστοποίησης θα χρησιμοποιήσουμε. "]},{"cell_type":"code","metadata":{"id":"garIyTRIAAWd","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1604999759000,"user_tz":-120,"elapsed":2022356,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"f1b5e497-3147-45f6-ebc2-278f6e874e0c"},"source":["from keras.models import Sequential\n","from keras.layers import Dense, Dropout\n","from keras.wrappers.scikit_learn import KerasClassifier\n","\n","\n","def DL_Model(activation= 'linear', neurons= 5, optimizer='Adam', input_dim= 30, dropout_rate= 0.3):\n","    \"\"\"Build and compile a two-hidden-layer binary classifier for the hyper-parameter search.\n","\n","    Args:\n","        activation: activation function used by both hidden Dense layers.\n","        neurons: number of units in each hidden Dense layer.\n","        optimizer: optimizer passed to compile().\n","        input_dim: number of input features expected by the first layer\n","            (default 30, the value that was previously hard-coded).\n","        dropout_rate: rate of the Dropout layer placed before the output layer\n","            (default 0.3, the value that was previously hard-coded).\n","\n","    Returns:\n","        A compiled Sequential model ending in a single sigmoid unit, using\n","        binary cross-entropy loss and reporting the accuracy metric.\n","    \"\"\"\n","    network = Sequential()\n","    network.add(Dense(neurons, input_dim= input_dim, activation= activation))\n","    network.add(Dense(neurons, activation= activation))\n","    network.add(Dropout(dropout_rate))\n","    network.add(Dense(1, activation='sigmoid'))\n","    network.compile(loss='binary_crossentropy', optimizer= optimizer, metrics=['accuracy'])\n","    return network\n","\n","# Defining grid parameters (5 * 6 * 3 = 90 combinations)\n","activation = ['softmax', 'relu', 'tanh', 'sigmoid', 'linear']\n","neurons = [5, 10, 15, 25, 35, 50]\n","optimizer = ['SGD', 'Adam', 'Adamax']\n","param_grid = dict(activation = activation, neurons = neurons, optimizer = optimizer)\n","\n","# input_dim is taken from the training data instead of assuming a fixed feature count;\n","# KerasClassifier forwards it to DL_Model because it is one of DL_Model's arguments.\n","clf = KerasClassifier(build_fn= DL_Model, input_dim= X_Train.shape[1], epochs= 80, batch_size=20, verbose= 0)\n","\n","model = GridSearchCV(estimator= clf, param_grid=param_grid, n_jobs=-1)\n","\n","model.fit(X_Train,Y_Train)\n","\n","print(\"Max Accuracy Registred: {} using {}\".format(round(model.best_score_,3), model.best_params_))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Max Accuracy Registred: 0.99 using 
{'activation': 'softmax', 'neurons': 15, 'optimizer': 'Adam'}\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"D-48k_J6ADeQ","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1605000049826,"user_tz":-120,"elapsed":917,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"5a8980b6-2fb5-4e17-8421-4b577fe47cd4"},"source":["# Predict on X_Test with the grid-searched model and report its metrics.\n","preds = model.predict(X_Test)\n","print(confusion_matrix(Y_Test,preds))\n","print(classification_report(Y_Test,preds))\n","acc8 = accuracy_score(Y_Test,preds)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[[130 0]\n"," [ 1 139]]\n"," precision recall f1-score support\n","\n"," 0 0.99 1.00 1.00 130\n"," 1 1.00 0.99 1.00 140\n","\n"," accuracy 1.00 270\n"," macro avg 1.00 1.00 1.00 270\n","weighted avg 1.00 1.00 1.00 270\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"MLh3_1XrxHSv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1605000054296,"user_tz":-120,"elapsed":853,"user":{"displayName":"Parask Tz","photoUrl":"","userId":"08609487936413149826"}},"outputId":"dc5da610-ce18-4c80-982f-74083f1fa6d5"},"source":["# Percentage change of each technique's accuracy relative to acc1\n","# (the value labelled 'Base Accuracy' in the printed lines).\n","tuned_accuracies = [\n","    ('Manual Search', acc2),\n","    ('Random Search', acc3),\n","    ('Grid Search', acc4),\n","    ('Progresive Grid Search', acc5),\n","    ('Bayesian Optimization Accuracy', acc6),\n","    ('Evolutionary Algorithms', acc7),\n","    ('Optimized ANN', acc8),\n","]\n","\n","# One loop replaces seven copy-pasted print statements; the printed text is unchanged.\n","for technique, tuned_acc in tuned_accuracies:\n","    print('Base Accuracy vs {} {:0.4f}%.'.format(technique, 100 * (tuned_acc - acc1) / acc1))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Base Accuracy vs Manual Search 
-1.4870%.\n","Base Accuracy vs Random Search 0.3717%.\n","Base Accuracy vs Grid Search -1.8587%.\n","Base Accuracy vs Progresive Grid Search -1.4870%.\n","Base Accuracy vs Bayesian Optimization Accuracy 0.3717%.\n","Base Accuracy vs Evolutionary Algorithms 0.3717%.\n","Base Accuracy vs Optimized ANN 0.0000%.\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"GurkFuKjAci-"},"source":["Τα αποτελέσματα που λάβαμε εξαρτώνται σε μεγάλο βαθμό από τον επιλεγμένο χώρο πλέγματος και από το σύνολο δεδομένων που χρησιμοποιείται. Επομένως, σε διαφορετικές καταστάσεις, διαφορετικές τεχνικές βελτιστοποίησης θα έχουν καλύτερη απόδοση από άλλες."]}]}