From b33e10e452151ad05467cfd59b4da249d0d4db8c Mon Sep 17 00:00:00 2001 From: kngines Date: Tue, 18 Jul 2017 08:01:48 +0800 Subject: [PATCH] Add files via upload --- ML_Distance_20170717.ipynb | 311 ++++++++++--------------------------- 1 file changed, 85 insertions(+), 226 deletions(-) diff --git a/ML_Distance_20170717.ipynb b/ML_Distance_20170717.ipynb index 8bf27a9..4b4fa21 100644 --- a/ML_Distance_20170717.ipynb +++ b/ML_Distance_20170717.ipynb @@ -198,64 +198,65 @@ "print (d2)" ] }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# 5 编辑距离(Edit distance)" + ] + }, { "cell_type": "code", - "execution_count": 71, + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "'''\n", + "1. 编辑距离是指两个字串之间,由一个转成另一个所需的最少编辑操作次数\n", + "2. 编辑操作包括:替换、插入、删除\n", + "3. 编辑距离求的是最少编辑次数,是简单的线性动态规划(最长上升子序列属于线性动态规划)\n", + "4. python-Levenshtein 包,可以计算编辑距离,hamming(汉明)距离,Jaro-Winkler距离等\n", + "\n", + "例如将eeba转变成abac:\n", + " 11. eba(删除第一个e)\n", + " 22. aba(将剩下的e替换成a)\n", + " 33. 
abac(在末尾插入c)\n", + " 所以eeba和abac的编辑距离就是3.\n", + "\n", + " 俄罗斯科学家Vladimir Levenshtein在1965年提出这个概念。\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 1, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.48718151 0.9888734 ]\n", - "[ 0.25107299 0.40016103]\n", - "--------\n", - "[ 0.23610852 0.58871236]\n", - "[ 0.25107299 0.40016103]\n", - "[ 0.53436156 0.15068729]\n", - "--------\n", - "[-0.28328857 0.24947374]\n", - "[ 0.53436156 0.15068729]\n", - "[ 0.27438423 0.32450102]\n", - "--------\n", - "[ 0.25997733 -0.17381373]\n", - "[ 0.27438423 0.32450102]\n", - "[ 0.14249925 0.66702626]\n", - "--------\n", - "[ 0.13188499 -0.34252524]\n", - "[ 0.14249925 0.66702626]\n", - "[ 0.8364992 0.28387395]\n", - "--------\n", - "[-0.69399995 0.38315232]\n", - "[ 0.8364992 0.28387395]\n", - "[ 0.6463512 0.60372953]\n", - "--------\n", - "[ 0.190148 -0.31985558]\n", - "[ 0.6463512 0.60372953]\n", - "[ 0.76219307 0.00768451]\n", - "--------\n", - "[-0.11584187 0.59604502]\n", - "[ 0.76219307 0.00768451]\n", - "[ 0.30117686 0.15284347]\n", - "--------\n", - "[ 0.4610162 -0.14515896]\n", - "[ 0.30117686 0.15284347]\n", - "[ 0.73310728 0.09928858]\n", - "--------\n", - "[-0.43193041 0.05355489]\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'Levenshtein'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mLevenshtein\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'Levenshtein'" ] } ], "source": [ - "n=XT.shape[0]\n", - "for i in range(0,n):\n", - " for j in range(i+1,n):\n", - " print 
#!/usr/bin/env python
# -*- coding: utf-8 -*-


class arithmetic():
    """Small collection of string-similarity algorithms."""

    def __init__(self):
        pass

    def levenshtein(self, first, second):
        """Return the Levenshtein (edit) distance between two sequences.

        The distance is the minimum number of single-element insertions,
        deletions and substitutions needed to turn ``first`` into ``second``.

        Parameters
        ----------
        first, second : str (or any indexable sequences with ``len``)
            The sequences to compare. Argument order does not affect the
            result because the distance is symmetric.

        Returns
        -------
        int
            The edit distance; 0 when both sequences are equal.
        """
        # The distance is symmetric, so put the shorter sequence first; the
        # empty-input shortcut below then only has to test one argument.
        if len(first) > len(second):
            first, second = second, first
        if len(first) == 0:
            return len(second)

        first_length = len(first) + 1
        second_length = len(second) + 1

        # distance_matrix[i][j] holds the distance between first[:i] and
        # second[:j]. Rows must be independent, mutable lists.
        distance_matrix = [[0] * second_length for _ in range(first_length)]

        # Base cases: turning a prefix into the empty sequence (or back)
        # costs exactly the prefix length. Column 0 must therefore count up
        # with the row index; leaving it at zero makes deleting a prefix of
        # `first` free and under-estimates the distance.
        for i in range(first_length):
            distance_matrix[i][0] = i
        for j in range(second_length):
            distance_matrix[0][j] = j

        for i in range(1, first_length):
            for j in range(1, second_length):
                deletion = distance_matrix[i - 1][j] + 1
                insertion = distance_matrix[i][j - 1] + 1
                substitution = distance_matrix[i - 1][j - 1]
                if first[i - 1] != second[j - 1]:
                    substitution += 1
                distance_matrix[i][j] = min(insertion, deletion, substitution)

        return distance_matrix[first_length - 1][second_length - 1]


if __name__ == "__main__":
    arith = arithmetic()
    print(arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs',
                            'GAMBOL00000000000dfasfasfdafsafasfasdfdsa'))
- "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on function inv in module numpy.linalg.linalg:\n", - "\n", - "inv(a)\n", - " Compute the (multiplicative) inverse of a matrix.\n", - " \n", - " Given a square matrix `a`, return the matrix `ainv` satisfying\n", - " ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``.\n", - " \n", - " Parameters\n", - " ----------\n", - " a : (..., M, M) array_like\n", - " Matrix to be inverted.\n", - " \n", - " Returns\n", - " -------\n", - " ainv : (..., M, M) ndarray or matrix\n", - " (Multiplicative) inverse of the matrix `a`.\n", - " \n", - " Raises\n", - " ------\n", - " LinAlgError\n", - " If `a` is not square or inversion fails.\n", - " \n", - " Notes\n", - " -----\n", - " \n", - " .. versionadded:: 1.8.0\n", - " \n", - " Broadcasting rules apply, see the `numpy.linalg` documentation for\n", - " details.\n", - " \n", - " Examples\n", - " --------\n", - " >>> from numpy.linalg import inv\n", - " >>> a = np.array([[1., 2.], [3., 4.]])\n", - " >>> ainv = inv(a)\n", - " >>> np.allclose(np.dot(a, ainv), np.eye(2))\n", - " True\n", - " >>> np.allclose(np.dot(ainv, a), np.eye(2))\n", - " True\n", - " \n", - " If a is a matrix object, then the return value is a matrix as well:\n", - " \n", - " >>> ainv = inv(np.matrix(a))\n", - " >>> ainv\n", - " matrix([[-2. , 1. ],\n", - " [ 1.5, -0.5]])\n", - " \n", - " Inverses of several matrices can be computed at once:\n", - " \n", - " >>> a = np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]])\n", - " >>> inv(a)\n", - " array([[[-2. , 1. ],\n", - " [ 1.5, -0.5]],\n", - " [[-5. , 2. ],\n", - " [ 3. , -1. 
]]])\n", - "\n" - ] - } - ], - "source": [ - "help(np.linalg.inv)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 0.33333333, -0.33333333],\n", - " [-0.33333333, 1.33333333]])" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.cov(npvecA, npvecB)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([-1, 0])" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sub" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 0])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "npvec.T[1]" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0])" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "npvec.T[0]" - ] - }, { "cell_type": "markdown", "metadata": {},