update data

Browse Source
lorendery 3 months ago
parent
commit
cf89c2cedf

+ 3
- 0
.idea/.gitignore

@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml

+ 12
- 0
.idea/First_repo.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

+ 18
- 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,18 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="5">
+            <item index="0" class="java.lang.String" itemvalue="bs4" />
+            <item index="1" class="java.lang.String" itemvalue="config" />
+            <item index="2" class="java.lang.String" itemvalue="mysql-connector-python" />
+            <item index="3" class="java.lang.String" itemvalue="argparse" />
+            <item index="4" class="java.lang.String" itemvalue="scipy" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6
- 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4
- 0
.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+</project>

+ 8
- 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/First_repo.iml" filepath="$PROJECT_DIR$/.idea/First_repo.iml" />
+    </modules>
+  </component>
+</project>

+ 6
- 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 4
- 5
code/data-preprocessing.py

@@ -1,15 +1,14 @@
 import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
-from src.const import *
-import string
+from const import *
+
 
 print(M_PRO_INIT, '\n' + M_PRO_LOAD_DATA)
 data = pd.read_csv(RAW_DATA_PATH)
 
 print(M_PRO_RMV_PUNC)
-clean_text = data[TEXT_COL_NAME].map(lambda x: x.lower().replace('\n', ''). \
-                                     translate(str.maketrans('', '', string.punctuation)))
+clean_text = data[TEXT_COL_NAME].map(lambda x: x.lower().replace('\n', ''))
 
 print(M_PRO_LE)
 y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})
@@ -29,4 +28,4 @@ print(M_PRO_SAVE_DATA)
 X_train.to_csv(X_TRAIN_PATH, index=False)
 X_test.to_csv(X_TEST_PATH, index=False)
 y_train.to_csv(Y_TRAIN_PATH, index=False)
-y_test.to_csv(Y_TEST_PATH, index=False)
+y_test.to_csv(Y_TEST_PATH, index=False)

+ 2
- 2
data.dvc

@@ -1,5 +1,5 @@
 outs:
-- md5: 050f1a7f90f90a94853ffd81075d6990.dir
-  size: 99212319
+- md5: 5e80dd94f68ba134e41da81cd88afdc5.dir
+  size: 99290108
   nfiles: 5
   path: data