隠れマルコフモデルによる動的2Dハンドジェスチャー認識【Python/GHMM】
Pythonの隠れマルコフモデルライブラリGHMMで動的2Dハンドジェスチャー認識をやってみたので紹介します。 今回は無料で公開されている2Dジェスチャーのデータベースを使いました。 このデータベースは、画像ではなく手の位置のシーケンスのみを記録したものとなっています。ジェスチャーの種類は4種類でそれぞれ10人が複数回繰り返したものです。
以下が各ジェスチャーの軌跡を可視化した図です。
必要なもの
- 動的ハンドジェスチャーデータベース (Sebastien Marcel Dynamic Hand Gesture Database)
- 隠れマルコフモデルライブラリGHMM
参考文献
手順(訓練)
各ジェスチャー毎に以下を行う
- データベースから訓練用データ集合を読み込む
- 読み込んだ手の座標のシーケンスをフレーム間の角度に変換後、離散化
- 初期確率行列を設定
- シーケンス毎にHMMを生成後、Baum-Welchアルゴリズムによってパラメータを学習
- 得られた複数のHMMのパラメータを平均化し、HMMをファイル出力
訓練コード
from ghmm import *
import math


def to_vector(sequence, degree):
    """Convert a 2D point sequence into discrete direction symbols.

    Each consecutive point pair is turned into the angle of its
    displacement vector, shifted into [0, 360), then quantized into
    bins of `degree` degrees, yielding 360/degree possible symbols.
    """
    vector = []
    for i in range(len(sequence) - 1):
        # atan2 returns (-180, 180]; +180 shifts it into [0, 360).
        angle = math.atan2(sequence[i + 1][1] - sequence[i][1],
                           sequence[i + 1][0] - sequence[i][0]) * 180 / math.pi + 180
        # BUG FIX: the original divided by the global `degree_divisor`
        # instead of the `degree` parameter, so any call with a
        # different `degree` quantized with the wrong bin width.
        symbol = int(angle / degree)
        if 360 / degree <= symbol:
            # Clamp the exact-360-degree edge case into the last bin.
            symbol = 360 / degree - 1
        vector.append(symbol)
    return vector


def train_single(gesture_type, state=3, degree=20):
    """Train one left-to-right HMM for `gesture_type` and save it as XML.

    Loads the training sequences, trains one HMM per sequence with
    Baum-Welch starting from the same initial model, averages the
    learned parameters over all sequences, and writes the resulting
    model to <gesture_type>.xml.
    """
    print("start train " + gesture_type)

    # --- load training set ------------------------------------------------
    # File format (after a header line): blocks of "x y" lines; a line
    # holding a single token separates consecutive sequences.
    train_file = open("dhg_marcel/" + gesture_type + "/train.dat")
    try:
        lines = train_file.readlines()
    finally:
        train_file.close()
    vector_set = []
    sequence = []
    for line in lines[1:]:
        words = line.split(" ")
        if len(words) == 1:
            vector_set.append(to_vector(sequence, degree))
            sequence = []
        else:
            sequence.append([float(words[0]), float(words[1])])
    vector_set.append(to_vector(sequence, degree))

    # --- define initial probabilities -------------------------------------
    # pi: always start in state 0.  A: upper-triangular (left-to-right
    # topology) with uniform mass over the reachable states.  B: uniform
    # over all emission symbols.
    number_of_emission = 360 / degree
    pi = [1 if i == 0 else 0 for i in range(state)]
    A = [[0 if j < i else 1.0 / (state - i) for j in range(state)]
         for i in range(state)]
    # BUG FIX: the original aliased every row of B to the same list
    # (B[i] = B[0]); build independent rows so a per-state update can
    # never leak into the other states.
    B = [[1.0 / number_of_emission] * number_of_emission for _ in range(state)]
    sigma = IntegerRange(0, number_of_emission)

    # --- accumulate parameters over per-sequence models -------------------
    sum_of_emission = [[0] * number_of_emission for _ in range(state)]
    sum_of_transition = [[0] * state for _ in range(state)]
    for vector in vector_set:
        m = HMMFromMatrices(sigma, DiscreteDistribution(sigma), A, B, pi)
        m.normalize()
        m.baumWelch(EmissionSequence(sigma, vector))
        for j in range(state):
            emi = m.getEmission(j)
            for k in range(number_of_emission):
                sum_of_emission[j][k] += emi[k]
            for k in range(state):
                sum_of_transition[j][k] += m.getTransition(j, k)

    # --- average and write out the final model ----------------------------
    n = len(vector_set)
    for i in range(state):
        for j in range(state):
            sum_of_transition[i][j] /= n
        for k in range(number_of_emission):
            sum_of_emission[i][k] /= n
    hmm = HMMFromMatrices(sigma, DiscreteDistribution(sigma), A, B, pi)
    hmm.normalize()
    for i in range(state):
        hmm.setEmission(i, sum_of_emission[i])
        for j in range(state):
            hmm.setTransition(i, j, sum_of_transition[i][j])
    hmm.normalize()
    hmm.write(gesture_type + '.xml')


def train(gtypes, state=3, degree=20):
    """Train and save one HMM per gesture type in `gtypes`."""
    for gesture_type in gtypes:
        train_single(gesture_type, state, degree)
手順(テスト)
- 全ジェスチャーのHMMをファイルから読み込む
- データベースからテスト用データ集合を読み込む
- 読み込んだ手の座標のシーケンスをフレーム間の角度に変換後、離散化
- 全ジェスチャーのHMMでシーケンス毎にViterbiアルゴリズムを適用し、最も尤度が高いHMMをそれぞれ認識結果とする
テストコード
from ghmm import *


def test(gtypes, state=3, degree=20):
    """Evaluate the saved per-gesture HMMs on the test sequences.

    For each test sequence, runs the Viterbi algorithm on every
    gesture's HMM and takes the model with the highest log-likelihood
    as the recognition result.  Writes per-gesture recognition rates
    to recognition_rate.dat and prints a summary.
    """
    print('start test')

    # --- load trained HMMs and the test sequences -------------------------
    hmms = {}
    dic_of_vector_set = {}
    for gesture_type in gtypes:
        hmm = HMMOpenXML.openNewXML(gesture_type + '.xml', None)
        hmms[gesture_type] = hmm
        print(gesture_type + " " + str(hmm))
        # Same file format as train.dat: header line, then "x y" blocks
        # separated by single-token lines.
        test_file = open("dhg_marcel/" + gesture_type + "/test.dat")
        try:
            lines = test_file.readlines()
        finally:
            test_file.close()
        vector_set = []
        sequence = []
        for line in lines[1:]:
            words = line.split(" ")
            if len(words) == 1:
                vector_set.append(to_vector(sequence, degree))
                sequence = []
            else:
                sequence.append([float(words[0]), float(words[1])])
        vector_set.append(to_vector(sequence, degree))
        dic_of_vector_set[gesture_type] = vector_set
    sigma = IntegerRange(0, 360 / degree)

    # --- classify every sequence by maximum Viterbi score -----------------
    corrects = {}
    wrongs = {}
    for target_gesture_type in gtypes:
        correct_num = 0
        wrong_num = 0
        for vector in dic_of_vector_set[target_gesture_type]:
            es = EmissionSequence(sigma, vector)
            # BUG FIX: the original seeded the best score with -9999, which
            # a long sequence's log-likelihood can legitimately undercut;
            # -inf guarantees any valid score wins.
            best_score = {'score': float('-inf'), 'type': target_gesture_type}
            for gesture_type in gtypes:
                score = hmms[gesture_type].viterbi(es)[1]
                if score == 1.0:  # invalid value
                    continue
                if best_score['score'] < score:
                    best_score['score'] = score
                    best_score['type'] = gesture_type
            if best_score['type'] == target_gesture_type:
                correct_num += 1
            else:
                wrong_num += 1
        corrects[target_gesture_type] = correct_num
        wrongs[target_gesture_type] = wrong_num

    # --- report -----------------------------------------------------------
    result_file = open('recognition_rate.dat', 'w')
    try:
        # BUG FIX: the original wrote the global `degree_divisor` here
        # instead of the `degree` parameter it actually tested with.
        result_file.write("state:" + str(state) + " divisor:" + str(degree) + '\n')
        for gesture_type in gtypes:
            total = corrects[gesture_type] + wrongs[gesture_type]
            rate = float(corrects[gesture_type]) / total
            result_file.write(gesture_type + '\t')
            result_file.write(str(rate) + '\n')
            print(gesture_type)
            print('wrong:' + str(wrongs[gesture_type]) + ' correct:' + str(corrects[gesture_type]))
            print('recognition rate:' + str(rate) + '\n')
    finally:
        result_file.close()
メイン文
if __name__ == '__main__':
    # Number of hidden states per gesture HMM.
    state_number = 3
    # Angle quantization step in degrees; 360/45 = 8 emission symbols,
    # matching the "シンボル数:8" result table.
    # NOTE(review): `degree_divisor` is also read as a module-level global
    # by the training/test code above, so this name must not change.
    degree_divisor = 45
    # The four gesture classes of the Marcel dynamic hand gesture database.
    gestures = ['Deictic', 'Symbolic', 'ReverseDeictic', 'ReverseSymbolic']
    train(gestures, state_number, degree_divisor)
    test(gestures, state_number, degree_divisor)
認識結果(状態数:3、シンボル数:8)
Deictic | Symbolic | ReverseDeictic | ReverseSymbolic |
---|---|---|---|
92.9% | 100% | 90% | 93.7% |