Kdotm · Kdotm · Jul 18, 2017
diff --git a/ML_Distance_20170717.ipynb b/ML_Distance_20170717.ipynb
@@ -198,64 +198,65 @@
     "print (d2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# 5 编辑距离（Edit distance）"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "'''\n",
+    "1. 编辑距离是指两个字串之间，由一个转成另一个所需的最少编辑操作次数\n",
+    "2. 编辑操作包括：替换、插入、删除\n",
+    "3. 编辑距离求的是最少编辑次数，是简单的线性动态规划（最长上升子序列属于线性动态规划）\n",
+    "4. python-Levenshtein 包，可以计算编辑距离，hamming（汉明）距离，Jaro-Winkler距离等\n",
+    "\n",
+    "例如将eeba转变成abac：\n",
+    "    1. eba（删除第一个e）\n",
+    "    2. aba（将剩下的e替换成a）\n",
+    "    3. abac（在末尾插入c）\n",
+    "    所以eeba和abac的编辑距离就是3.\n",
+    "\n",
+    "    俄罗斯科学家Vladimir Levenshtein在1965年提出这个概念。\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[ 0.48718151  0.9888734 ]\n",
-      "[ 0.25107299  0.40016103]\n",
-      "--------\n",
-      "[ 0.23610852  0.58871236]\n",
-      "[ 0.25107299  0.40016103]\n",
-      "[ 0.53436156  0.15068729]\n",
-      "--------\n",
-      "[-0.28328857  0.24947374]\n",
-      "[ 0.53436156  0.15068729]\n",
-      "[ 0.27438423  0.32450102]\n",
-      "--------\n",
-      "[ 0.25997733 -0.17381373]\n",
-      "[ 0.27438423  0.32450102]\n",
-      "[ 0.14249925  0.66702626]\n",
-      "--------\n",
-      "[ 0.13188499 -0.34252524]\n",
-      "[ 0.14249925  0.66702626]\n",
-      "[ 0.8364992   0.28387395]\n",
-      "--------\n",
-      "[-0.69399995  0.38315232]\n",
-      "[ 0.8364992   0.28387395]\n",
-      "[ 0.6463512   0.60372953]\n",
-      "--------\n",
-      "[ 0.190148   -0.31985558]\n",
-      "[ 0.6463512   0.60372953]\n",
-      "[ 0.76219307  0.00768451]\n",
-      "--------\n",
-      "[-0.11584187  0.59604502]\n",
-      "[ 0.76219307  0.00768451]\n",
-      "[ 0.30117686  0.15284347]\n",
-      "--------\n",
-      "[ 0.4610162  -0.14515896]\n",
-      "[ 0.30117686  0.15284347]\n",
-      "[ 0.73310728  0.09928858]\n",
-      "--------\n",
-      "[-0.43193041  0.05355489]\n"
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'Levenshtein'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-1-1ef310874120>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mLevenshtein\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'Levenshtein'"
      ]
     }
    ],
    "source": [
-    "n=XT.shape[0]\n",
-    "for i in range(0,n):\n",
-    "    for j in range(i+1,n):\n",
-    "        print (XT[i])\n",
-    "        print (XT[j])\n",
-    "        delta=XT[i]-XT[j]\n",
-    "        print ('--------')\n",
-    "        print (delta)\n",
-    "        break"
+    "from Levenshtein import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 自定义实现方法"
    ]
   },
   {
@@ -265,7 +266,41 @@
     "collapsed": true
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "#!/usr/bin/env python\n",
+    "# -*- coding: utf-8 -*-\n",
+    "\n",
+    "class arithmetic():\n",
+    "    '''Edit-distance (Levenshtein) string-similarity helper.'''\n",
+    "\n",
+    "    def levenshtein(self, first, second):\n",
+    "        '''Return the minimum number of single-character substitutions,\n",
+    "        insertions and deletions needed to turn `first` into `second`.'''\n",
+    "        # The distance is symmetric, so keep the shorter string in `first`.\n",
+    "        if len(first) > len(second):\n",
+    "            first, second = second, first\n",
+    "        if len(first) == 0:\n",
+    "            return len(second)\n",
+    "        first_length = len(first) + 1\n",
+    "        second_length = len(second) + 1\n",
+    "        # Mutable rows (a bare range() is read-only on Python 3). Row i starts\n",
+    "        # at i so that column 0 costs i deletions; row 0 is 0..len(second).\n",
+    "        distance_matrix = [list(range(i, i + second_length))\n",
+    "                           for i in range(first_length)]\n",
+    "        for i in range(1, first_length):\n",
+    "            for j in range(1, second_length):\n",
+    "                deletion = distance_matrix[i-1][j] + 1\n",
+    "                insertion = distance_matrix[i][j-1] + 1\n",
+    "                substitution = distance_matrix[i-1][j-1]\n",
+    "                if first[i-1] != second[j-1]:\n",
+    "                    substitution += 1\n",
+    "                distance_matrix[i][j] = min(insertion, deletion, substitution)\n",
+    "        return distance_matrix[first_length-1][second_length-1]\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    arith = arithmetic()\n",
+    "    print(arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa'))"
+   ]
   },
   {
    "cell_type": "code",
@@ -306,182 +341,6 @@
     "print (math.sqrt(np.dot(inv_sub, sub).dot(sub.T)))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([-1,  0])"
-      ]
-     },
-     "execution_count": 45,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sub"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Help on function inv in module numpy.linalg.linalg:\n",
-      "\n",
-      "inv(a)\n",
-      "    Compute the (multiplicative) inverse of a matrix.\n",
-      "    \n",
-      "    Given a square matrix `a`, return the matrix `ainv` satisfying\n",
-      "    ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``.\n",
-      "    \n",
-      "    Parameters\n",
-      "    ----------\n",
-      "    a : (..., M, M) array_like\n",
-      "        Matrix to be inverted.\n",
-      "    \n",
-      "    Returns\n",
-      "    -------\n",
-      "    ainv : (..., M, M) ndarray or matrix\n",
-      "        (Multiplicative) inverse of the matrix `a`.\n",
-      "    \n",
-      "    Raises\n",
-      "    ------\n",
-      "    LinAlgError\n",
-      "        If `a` is not square or inversion fails.\n",
-      "    \n",
-      "    Notes\n",
-      "    -----\n",
-      "    \n",
-      "    .. versionadded:: 1.8.0\n",
-      "    \n",
-      "    Broadcasting rules apply, see the `numpy.linalg` documentation for\n",
-      "    details.\n",
-      "    \n",
-      "    Examples\n",
-      "    --------\n",
-      "    >>> from numpy.linalg import inv\n",
-      "    >>> a = np.array([[1., 2.], [3., 4.]])\n",
-      "    >>> ainv = inv(a)\n",
-      "    >>> np.allclose(np.dot(a, ainv), np.eye(2))\n",
-      "    True\n",
-      "    >>> np.allclose(np.dot(ainv, a), np.eye(2))\n",
-      "    True\n",
-      "    \n",
-      "    If a is a matrix object, then the return value is a matrix as well:\n",
-      "    \n",
-      "    >>> ainv = inv(np.matrix(a))\n",
-      "    >>> ainv\n",
-      "    matrix([[-2. ,  1. ],\n",
-      "            [ 1.5, -0.5]])\n",
-      "    \n",
-      "    Inverses of several matrices can be computed at once:\n",
-      "    \n",
-      "    >>> a = np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]])\n",
-      "    >>> inv(a)\n",
-      "    array([[[-2. ,  1. ],\n",
-      "            [ 1.5, -0.5]],\n",
-      "           [[-5. ,  2. ],\n",
-      "            [ 3. , -1. ]]])\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "help(np.linalg.inv)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[ 0.33333333, -0.33333333],\n",
-       "       [-0.33333333,  1.33333333]])"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "np.cov(npvecA, npvecB)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([-1,  0])"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sub"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1, 0])"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "npvec.T[1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([0, 0])"
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "npvec.T[0]"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},