Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
311 changes: 85 additions & 226 deletions ML_Distance_20170717.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -198,64 +198,65 @@
"print (d2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 5 编辑距离(Edit distance)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"'''\n",
"1. 编辑距离是指两个字串之间,由一个转成另一个所需的最少编辑操作次数\n",
"2. 编辑操作包括:替换、插入、删除\n",
"3. 编辑距离求的是最少编辑次数,是简单的线性动态规划(最长上升子序列属于线性动态规划)\n",
"4. python-Levenshtein 包,可以计算编辑距离,hamming(汉明)距离,Jaro-Winkler距离等\n",
"\n",
"例如将eeba转变成abac:\n",
" 1. eba(删除第一个e)\n",
" 2. aba(将剩下的e替换成a)\n",
" 3. abac(在末尾插入c)\n",
" 所以eeba和abac的编辑距离就是3.\n",
"\n",
" 俄罗斯科学家Vladimir Levenshtein在1965年提出这个概念。\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.48718151 0.9888734 ]\n",
"[ 0.25107299 0.40016103]\n",
"--------\n",
"[ 0.23610852 0.58871236]\n",
"[ 0.25107299 0.40016103]\n",
"[ 0.53436156 0.15068729]\n",
"--------\n",
"[-0.28328857 0.24947374]\n",
"[ 0.53436156 0.15068729]\n",
"[ 0.27438423 0.32450102]\n",
"--------\n",
"[ 0.25997733 -0.17381373]\n",
"[ 0.27438423 0.32450102]\n",
"[ 0.14249925 0.66702626]\n",
"--------\n",
"[ 0.13188499 -0.34252524]\n",
"[ 0.14249925 0.66702626]\n",
"[ 0.8364992 0.28387395]\n",
"--------\n",
"[-0.69399995 0.38315232]\n",
"[ 0.8364992 0.28387395]\n",
"[ 0.6463512 0.60372953]\n",
"--------\n",
"[ 0.190148 -0.31985558]\n",
"[ 0.6463512 0.60372953]\n",
"[ 0.76219307 0.00768451]\n",
"--------\n",
"[-0.11584187 0.59604502]\n",
"[ 0.76219307 0.00768451]\n",
"[ 0.30117686 0.15284347]\n",
"--------\n",
"[ 0.4610162 -0.14515896]\n",
"[ 0.30117686 0.15284347]\n",
"[ 0.73310728 0.09928858]\n",
"--------\n",
"[-0.43193041 0.05355489]\n"
"ename": "ModuleNotFoundError",
"evalue": "No module named 'Levenshtein'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-1ef310874120>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mLevenshtein\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'Levenshtein'"
]
}
],
"source": [
"# Print every row of XT next to the row that follows it, then their difference.\n",
"n = XT.shape[0]\n",
"for i in range(n):\n",
"    # Each row is compared only with its immediate successor; the last row\n",
"    # has none, so nothing is printed for it.\n",
"    if i + 1 >= n:\n",
"        continue\n",
"    j = i + 1\n",
"    print (XT[i])\n",
"    print (XT[j])\n",
"    delta = XT[i] - XT[j]\n",
"    print ('--------')\n",
"    print (delta)"
"from Levenshtein import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 自定义实现方法"
]
},
{
Expand All @@ -265,7 +266,41 @@
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"#!/usr/bin/env python\n",
"# -*- coding: utf-8 -*-\n",
"\n",
"\n",
"class arithmetic():\n",
"    '''String-similarity helpers; currently only the Levenshtein edit distance.'''\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def levenshtein(self, first, second):\n",
"        '''Return the Levenshtein (edit) distance between two sequences.\n",
"\n",
"        The distance is the minimum number of single-element insertions,\n",
"        deletions and substitutions needed to turn `first` into `second`.\n",
"\n",
"        Args:\n",
"            first: A string (or any indexable sequence that supports len()).\n",
"            second: The sequence to compare against.\n",
"\n",
"        Returns:\n",
"            The edit distance as a non-negative int.\n",
"        '''\n",
"        # The distance is symmetric, so put the shorter input first.\n",
"        if len(first) > len(second):\n",
"            first, second = second, first\n",
"        # After the swap `first` is the shorter input; if it is empty, every\n",
"        # element of `second` has to be inserted.\n",
"        if len(first) == 0:\n",
"            return len(second)\n",
"        first_length = len(first) + 1\n",
"        second_length = len(second) + 1\n",
"        # distance_matrix[i][j] holds the distance between first[:i] and\n",
"        # second[:j]. Rows must be real lists: range objects cannot be\n",
"        # assigned to on Python 3.\n",
"        distance_matrix = [[0] * second_length for _ in range(first_length)]\n",
"        # Boundary conditions: converting a prefix to or from the empty\n",
"        # sequence costs one edit per element.\n",
"        for i in range(first_length):\n",
"            distance_matrix[i][0] = i\n",
"        for j in range(second_length):\n",
"            distance_matrix[0][j] = j\n",
"        for i in range(1, first_length):\n",
"            for j in range(1, second_length):\n",
"                deletion = distance_matrix[i - 1][j] + 1\n",
"                insertion = distance_matrix[i][j - 1] + 1\n",
"                substitution = distance_matrix[i - 1][j - 1]\n",
"                if first[i - 1] != second[j - 1]:\n",
"                    substitution += 1\n",
"                distance_matrix[i][j] = min(insertion, deletion, substitution)\n",
"        return distance_matrix[first_length - 1][second_length - 1]\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
"    arith = arithmetic()\n",
"    print(arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs', 'GAMBOL00000000000dfasfasfdafsafasfasdfdsa'))"
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -306,182 +341,6 @@
"print (math.sqrt(np.dot(inv_sub, sub).dot(sub.T)))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-1, 0])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on function inv in module numpy.linalg.linalg:\n",
"\n",
"inv(a)\n",
" Compute the (multiplicative) inverse of a matrix.\n",
" \n",
" Given a square matrix `a`, return the matrix `ainv` satisfying\n",
" ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``.\n",
" \n",
" Parameters\n",
" ----------\n",
" a : (..., M, M) array_like\n",
" Matrix to be inverted.\n",
" \n",
" Returns\n",
" -------\n",
" ainv : (..., M, M) ndarray or matrix\n",
" (Multiplicative) inverse of the matrix `a`.\n",
" \n",
" Raises\n",
" ------\n",
" LinAlgError\n",
" If `a` is not square or inversion fails.\n",
" \n",
" Notes\n",
" -----\n",
" \n",
" .. versionadded:: 1.8.0\n",
" \n",
" Broadcasting rules apply, see the `numpy.linalg` documentation for\n",
" details.\n",
" \n",
" Examples\n",
" --------\n",
" >>> from numpy.linalg import inv\n",
" >>> a = np.array([[1., 2.], [3., 4.]])\n",
" >>> ainv = inv(a)\n",
" >>> np.allclose(np.dot(a, ainv), np.eye(2))\n",
" True\n",
" >>> np.allclose(np.dot(ainv, a), np.eye(2))\n",
" True\n",
" \n",
" If a is a matrix object, then the return value is a matrix as well:\n",
" \n",
" >>> ainv = inv(np.matrix(a))\n",
" >>> ainv\n",
" matrix([[-2. , 1. ],\n",
" [ 1.5, -0.5]])\n",
" \n",
" Inverses of several matrices can be computed at once:\n",
" \n",
" >>> a = np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]])\n",
" >>> inv(a)\n",
" array([[[-2. , 1. ],\n",
" [ 1.5, -0.5]],\n",
" [[-5. , 2. ],\n",
" [ 3. , -1. ]]])\n",
"\n"
]
}
],
"source": [
"help(np.linalg.inv)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.33333333, -0.33333333],\n",
" [-0.33333333, 1.33333333]])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.cov(npvecA, npvecB)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-1, 0])"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 0])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"npvec.T[1]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0])"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"npvec.T[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down