Can't combine the programs

I have code that, using a webcam and cv2:

  • moves the cursor with the index finger;
  • adjusts the volume by the distance between the index finger and the thumb.

I merged them into one program; otherwise they are two standalone programs (one for the cursor, one for the volume), and each works fine on its own.

HandTrackingModule.py:

import math  # used by findDistance (math.hypot)
import time

import cv2
import mediapipe as mp


class handDetector:
   def __init__(self, mode=False, maxHands=6, complexity=1, detectionCon=0.5, trackCon=0.5):
       self.mode = mode
       self.maxHands = maxHands
       self.complexity = complexity
       self.detectionCon = detectionCon
       self.trackCon = trackCon

       self.mpHands = mp.solutions.hands
       self.hands = self.mpHands.Hands(self.mode, self.maxHands, self.complexity,
                                       self.detectionCon, self.trackCon, )
       self.mpDraw = mp.solutions.drawing_utils
       self.tipIds = [4, 8, 12, 16, 20]

   def findHands(self, img, draw=True):
       imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
       self.results = self.hands.process(imgRGB)
       # print(self.results.multi_hand_landmarks)
       if self.results.multi_hand_landmarks:
           for handLms in self.results.multi_hand_landmarks:
               if draw:
                   self.mpDraw.draw_landmarks(img, handLms, self.mpHands.HAND_CONNECTIONS)
       return img

   def findPosition(self, img, handNo=0, draw=True):
       xList = []
       yList = []
       bbox = []
       self.lmList = []
       if self.results.multi_hand_landmarks:
           myHand = self.results.multi_hand_landmarks[handNo]
           for id, lm in enumerate(myHand.landmark):
               # print(id, lm)
               h, w, c = img.shape
               cx, cy = int(lm.x * w), int(lm.y * h)
               xList.append(cx)
               yList.append(cy)
               # print(id, cx, cy)
               self.lmList.append([id, cx, cy])
               if draw:
                   cv2.circle(img, (cx, cy), 5, (255, 0, 255), cv2.FILLED)

           xmin, xmax = min(xList), max(xList)
           ymin, ymax = min(yList), max(yList)
           bbox = xmin, ymin, xmax, ymax

           if draw:
               cv2.rectangle(img, (xmin - 20, ymin - 20), (xmax + 20, ymax + 20), (0, 255, 0), 2)

       return self.lmList, bbox

   def fingersUp(self):
       fingers = []
       # Thumb
       if self.lmList[self.tipIds[0]][1] > self.lmList[self.tipIds[0] - 1][1]:
           fingers.append(1)
       else:
           fingers.append(0)
       # Fingers
       for id in range(1, 5):
           if self.lmList[self.tipIds[id]][2] < self.lmList[self.tipIds[id] - 2][2]:
               fingers.append(1)
           else:
               fingers.append(0)
       # totalFingers = fingers.count(1)
       return fingers

   def findDistance(self, p1, p2, img, draw=True, r=15, t=3):
       x1, y1 = self.lmList[p1][1:]
       x2, y2 = self.lmList[p2][1:]
       cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

       if draw:
           cv2.line(img, (x1, y1), (x2, y2), (255, 0, 255), t)
           cv2.circle(img, (x1, y1), r, (255, 0, 255), cv2.FILLED)
           cv2.circle(img, (x2, y2), r, (255, 0, 255), cv2.FILLED)
           cv2.circle(img, (cx, cy), r, (0, 0, 255), cv2.FILLED)
       length = math.hypot(x2 - x1, y2 - y1)

       return length, img, [x1, y1, x2, y2, cx, cy]


def main():
   pTime = 0
   cTime = 0
   cap = cv2.VideoCapture(0)
   detector = handDetector()
   while True:
       success, img = cap.read()
       img = detector.findHands(img)
       lmList, bbox = detector.findPosition(img)
       if len(lmList) != 0:
           print(lmList[4])
       cTime = time.time()
       fps = 1 / (cTime - pTime)
       pTime = cTime
       cv2.putText(img, str(int(fps)), (10, 70), cv2.FONT_HERSHEY_PLAIN, 3,
                   (255, 0, 255), 3)
       cv2.imshow("Image", img)
       cv2.waitKey(1)


if __name__ == "__main__":
   main()
..........................................................................................
AIVirtualMouse.py:


import time

import cv2
import numpy as np
import pyautogui

import HandTrackingModule as htm

######################
wCam, hCam = 900, 720
frameR = 100  # Frame Reduction
smoothening = 7  # random value
######################

pTime = 0
plocX, plocY = 0, 0
clocX, clocY = 0, 0
cap = cv2.VideoCapture(0)
cap.set(3, wCam)
cap.set(4, hCam)

detector = htm.handDetector(maxHands=1)
wScr, hScr = pyautogui.size()

while True:
   # Step1: Find the landmarks
   success, img = cap.read()
   img = detector.findHands(img)
   lmList, bbox = detector.findPosition(img)
   # Step2: Get the tip of the index and middle finger
   if len(lmList) != 0:
       x1, y1 = lmList[8][1:]
       x2, y2 = lmList[12][1:]

       # Step3: Check which fingers are up
       fingers = detector.fingersUp()
       cv2.rectangle(img, (frameR, frameR), (wCam - frameR, hCam - frameR),
                     (255, 0, 255), 2)

       # Step4: Only Index Finger: Moving Mode
       if fingers[1] == 1 and fingers[2] == 0:
           # Step5: Convert the coordinates
           x3 = np.interp(x1, (frameR, wCam - frameR), (0, wScr))
           y3 = np.interp(y1, (frameR, hCam - frameR), (0, hScr))

           # Step6: Smooth Values
           clocX = plocX + (x3 - plocX) / smoothening
           clocY = plocY + (y3 - plocY) / smoothening

           # Step7: Move Mouse
           pyautogui.moveTo(wScr - clocX, clocY)
           cv2.circle(img, (x1, y1), 15, (255, 0, 255), cv2.FILLED)
           plocX, plocY = clocX, clocY

       # Step8: Both Index and middle are up: Clicking Mode
       if fingers[1] == 1 and fingers[2] == 1:
           # Step9: Find distance between fingers
           length, img, lineInfo = detector.findDistance(8, 12, img)

           # Step10: Click mouse if distance short
           if length < 40:
               cv2.circle(img, (lineInfo[4], lineInfo[5]), 15, (0, 255, 0), cv2.FILLED)
               pyautogui.click()

   # Step11: Frame rate
   cTime = time.time()
   fps = 1 / (cTime - pTime)
   pTime = cTime
   cv2.putText(img, str(int(fps)), (28, 58), cv2.FONT_HERSHEY_PLAIN, 3, (255, 8, 8), 3)

   # Step12: Display
   cv2.imshow("Image", img)
   if cv2.waitKey(1) & 0xFF == ord('q'):
       break

cap.release()
cv2.destroyAllWindows()

VolumeHandControlAdvance.py:
.............................................................................................

import cv2
import time
import numpy as np
import HandTrackingModule as htm
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

################################
wCam, hCam = 640, 480
################################

cap = cv2.VideoCapture(0)
cap.set(3, wCam)
cap.set(4, hCam)
pTime = 0

detector = htm.handDetector(detectionCon=0.7, maxHands=1)

devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(
   IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
# volume.GetMute()
# volume.GetMasterVolumeLevel()
volRange = volume.GetVolumeRange()
minVol = volRange[0]
maxVol = volRange[1]
vol = 0
volBar = 400
volPer = 0
area = 0
colorVol = (255, 0, 0)

while True:
   success, img = cap.read()

   # Find Hand
   img = detector.findHands(img)
   lmList, bbox = detector.findPosition(img, draw=True)
   if len(lmList) != 0:

       # Filter based on size
       area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) // 100
       # print(area)
       if 250 < area < 1000:

           # Find Distance between index and Thumb
           length, img, lineInfo = detector.findDistance(4, 8, img)
           # print(length)

           # Convert Volume
           volBar = np.interp(length, [50, 200], [400, 150])
           volPer = np.interp(length, [50, 200], [0, 100])

           # Reduce Resolution to make it smoother
           smoothness = 10
           volPer = smoothness * round(volPer / smoothness)

           # Check fingers up
           fingers = detector.fingersUp()
           # print(fingers)

           # If pinky is down set volume
           if not fingers[4]:
               volume.SetMasterVolumeLevelScalar(volPer / 100, None)
               cv2.circle(img, (lineInfo[4], lineInfo[5]), 15, (0, 255, 0), cv2.FILLED)
               colorVol = (0, 255, 0)
           else:
               colorVol = (255, 0, 0)

   # Drawings
   cv2.rectangle(img, (50, 150), (85, 400), (255, 0, 0), 3)
   cv2.rectangle(img, (50, int(volBar)), (85, 400), (255, 0, 0), cv2.FILLED)
   cv2.putText(img, f'{int(volPer)} %', (40, 450), cv2.FONT_HERSHEY_COMPLEX,
               1, (255, 0, 0), 3)
   cVol = int(volume.GetMasterVolumeLevelScalar() * 100)
   cv2.putText(img, f'Vol Set: {int(cVol)}', (400, 50), cv2.FONT_HERSHEY_COMPLEX,
               1, colorVol, 3)

   # Frame rate
   cTime = time.time()
   fps = 1 / (cTime - pTime)
   pTime = cTime
   cv2.putText(img, f'FPS: {int(fps)}', (40, 50), cv2.FONT_HERSHEY_COMPLEX,
               1, (255, 0, 0), 3)

   cv2.imshow("Img", img)
   cv2.waitKey(1)
..............................................................................................
run.py (this code runs AIVirtualMouse and VolumeHandControlAdvance together):

import time
from ctypes import cast, POINTER

import cv2
import numpy as np
import pyautogui
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

import HandTrackingModule as htm

# Camera settings
wCam, hCam = 900, 720
frameR = 100  # Frame Reduction
smoothening = 7  # Smoothening factor for cursor movement

# Initialize variables
pTime = 0
plocX, plocY = 0, 0
clocX, clocY = 0, 0

# Capture video
cap = cv2.VideoCapture(0)
cap.set(3, wCam)
cap.set(4, hCam)

# Initialize hand detector
detector = htm.handDetector(maxHands=1, detectionCon=0.7)

# Get screen size
wScr, hScr = pyautogui.size()

# Audio setup
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
volRange = volume.GetVolumeRange()
minVol, maxVol = volRange[0], volRange[1]
vol, volBar, volPer = 0, 400, 0
area = 0
colorVol = (255, 0, 0)

while True:
   # Capture frame
   success, img = cap.read()

   # Find hand landmarks
   img = detector.findHands(img)
   lmList, bbox = detector.findPosition(img)

   if len(lmList) != 0:
       # Get coordinates of index and middle fingers
       x1, y1 = lmList[8][1:]
       x2, y2 = lmList[12][1:]

       # Check which fingers are up
       fingers = detector.fingersUp()

       # Drawing rectangle for cursor movement area
       cv2.rectangle(img, (frameR, frameR), (wCam - frameR, hCam - frameR), (255, 0, 255), 2)

       # Cursor Movement Mode
       if fingers[1] == 1 and fingers[2] == 0:
           # Convert coordinates
           x3 = np.interp(x1, (frameR, wCam - frameR), (0, wScr))
           y3 = np.interp(y1, (frameR, hCam - frameR), (0, hScr))

           # Smooth values
           clocX = plocX + (x3 - plocX) / smoothening
           clocY = plocY + (y3 - plocY) / smoothening

           # Move mouse
           pyautogui.moveTo(wScr - clocX, clocY)
           cv2.circle(img, (x1, y1), 15, (255, 0, 255), cv2.FILLED)
           plocX, plocY = clocX, clocY

       # Clicking Mode
       if fingers[1] == 1 and fingers[2] == 1:
           length, img, lineInfo = detector.findDistance(8, 12, img)
           if length < 40:
               cv2.circle(img, (lineInfo[4], lineInfo[5]), 15, (0, 255, 0), cv2.FILLED)
               pyautogui.click()

       # Volume Control Mode
       area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) // 100
       if 250 < area < 1000:
           length, img, lineInfo = detector.findDistance(4, 8, img)

           # Convert length to volume
           volBar = np.interp(length, [50, 200], [400, 150])
           volPer = np.interp(length, [50, 200], [0, 100])

           # Smooth out the volume changes
           smoothness = 10
           volPer = smoothness * round(volPer / smoothness)

           # Change volume if pinky is down
           if not fingers[4]:
               volume.SetMasterVolumeLevelScalar(volPer / 100, None)
               cv2.circle(img, (lineInfo[4], lineInfo[5]), 15, (0, 255, 0), cv2.FILLED)
               colorVol = (0, 255, 0)
           else:
               colorVol = (255, 0, 0)

           # Draw volume bar
           cv2.rectangle(img, (50, 150), (85, 400), (255, 0, 0), 3)
           cv2.rectangle(img, (50, int(volBar)), (85, 400), (255, 0, 0), cv2.FILLED)
           cv2.putText(img, f'{int(volPer)} %', (40, 450), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)

           # Display current volume
           cVol = int(volume.GetMasterVolumeLevelScalar() * 100)
           cv2.putText(img, f'Vol Set: {int(cVol)}', (400, 50), cv2.FONT_HERSHEY_COMPLEX, 1, colorVol, 3)

   # Calculate and display FPS
   cTime = time.time()
   fps = 1 / (cTime - pTime)
   pTime = cTime
   cv2.putText(img, f'FPS: {int(fps)}', (40, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)

   # Display the image
   cv2.imshow("Image", img)

   # Exit on 'q' key press
   if cv2.waitKey(1) & 0xFF == ord('q'):
       break

cap.release()
cv2.destroyAllWindows()

I also have the code of a voice assistant, ALPHA-MAIN-APP.py:

try:
    import os
    import json
    import customtkinter
    import time
    import webbrowser
    import logging
    from num2word import word
    import pyaudio
    from vosk import Model, KaldiRecognizer, SetLogLevel
    import torch
    import sounddevice as sd
    import keyboard
    import numpy
    import silero
    import threading
    from gigachat import GigaChat
    from transliterate import translit
    import re
    from num2words import num2words
    from pathlib import Path
    import speech_recognition as sr
    import codecs
except ImportError:
    print('Не все библиотеки установлены.')
    os.system(
        'pip install num2word pyaudio vosk torch sounddevice keyboard silero numpy customtkinter gigachat transliterate num2words pathlib SpeechRecognition')

# Load saved settings
with codecs.open(Path('files/config_alpha.json').resolve(), 'r', 'utf-8') as data:
    config = json.load(data)
    data.close()

# Wake word
if config['wakeword'] == '' or config['wakeword'] == ' ':
    wakeword = "альфа"
else:
    wakeword = tuple(config['wakeword'].lower().replace(',', '').split())
# Text-to-speech voice
speaker = config['voice']

# How long commands are accepted without the wake word
time_wait = config['time']

# Text-to-speech model
model_id = config['sintez']

# Speech recognition model
if config['rasp'] == '0.22':
    model = Model('vosk-model-small-ru-0.22')
elif config['rasp'] == '0.4':
    model = Model('vosk-model-small-ru-0.4')

# Recognition backend
recognition = config['rasp_type']

# GigaChat API key
gc_api = config['gc_api']

# Lists of keywords that should be removed from a command or replaced in it
to_replace = ['найди ', 'поищи', 'включи ', 'включить ', 'включил ', 'музыка ', 'музыку ', 'песня ', 'песню', 'видео ']
to_replace_write = ['напиши', 'введи']
to_replace_special = [['точка с запятой', ';'], ['запятая', ','], ['точка', '.'], ['дефис ', '-'], ['двоеточие', ':'],
                      ['знак вопроса', '?'], ['восклицательный знак', '!']]

# Constants
sample_rate = 48000

language = 'ru'

device = torch.device('cpu')

put_accent = True

put_yo = True

rec = KaldiRecognizer(model, 16000)

p = pyaudio.PyAudio()

stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
stream.start_stream()

giga = GigaChat(credentials=gc_api, scope='GIGACHAT_API_PERS', verify_ssl_certs=False)

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

mic = sr.Microphone()

r = sr.Recognizer()

time_ = 0


# Speech recognition
def listen():
    while True:
        data = stream.read(4000, exception_on_overflow=False)
        if (rec.AcceptWaveform(data)) and (len(data) > 0):
            com_rec = json.loads(rec.Result())
            if com_rec['text']:
                yield com_rec['text']


# Speech synthesis
def speak(text):
    logging.info('Асссистент: ' + text)
    text = translit(text.replace('c', 'к'), 'ru').lower().replace('w', 'в').replace('x', 'кс')
    num = re.findall(r'-?\d+\+?', text)
    if num != []:
        for i in num:
            numt = num2words(int(i), lang='ru')
            text = text.replace(i, numt)
    audio = model.apply_tts(text, speaker=speaker, sample_rate=sample_rate, put_accent=put_accent, put_yo=put_yo)
    sd.play(audio, sample_rate)
    time.sleep(len(audio) / sample_rate + 1.7)
    sd.stop()


# Command functions
def open_(param):
    try:
        os.startfile(param)
    except:
        webbrowser.open(param)


def search(param):
    zapros = com_rec.lower()
    for i in wakeword:
        zapros = zapros.replace(i + ' ', '')
    zapros = zapros.lower().replace('найди ', '')
    webbrowser.open('https://www.google.com/search?q=' + zapros)


def search_song(param):
    zapros = com_rec.lower()
    for i in wakeword:
        zapros = zapros.replace(i + ' ', '')
    for i in to_replace:
        zapros = zapros.replace(i, '')
    webbrowser.open('https://music.yandex.ru/search?text=' + zapros)


def search_video(param):
    zapros = com_rec.lower()
    for i in wakeword:
        zapros = zapros.replace(i + ' ', '')
    for i in to_replace:
        zapros = zapros.replace(i, '')
    webbrowser.open('https://www.youtube.com/results?search_query=' + zapros)


def browser(param):
    eval(f'{param}()')


def new_tab():
    keyboard.send('ctrl+t')


def incognito_tab():
    keyboard.send('ctrl+shift+n')


def prev_tab():
    keyboard.send('ctrl+shift+tab')


def next_tab():
    keyboard.send('ctrl+tab')


def down():
    keyboard.send('pagedown')


def up():
    keyboard.send('pageup')


def end():
    keyboard.send('end')


def home():
    keyboard.send('home')


def tell(param):
    zapros = com_rec.lower()
    for i in wakeword:
        zapros = zapros.replace(i + ' ', '')
    response = giga.chat(zapros + '. Ответ должен быть очень кратким')
    speak(response.choices[0].message.content)


def write_text(param):
    text_to_write = com_rec.lower()
    for i in wakeword:
        text_to_write = text_to_write.replace(i + ' ', '')
    for i in to_replace_write:
        text_to_write = text_to_write.replace(i + ' ', '')
    for i in to_replace_special:
        text_to_write = text_to_write.replace(' ' + i[0], i[1])
    keyboard.write(text_to_write + ' ')


def repeat_recognized_text(text):
    speak(text)


def main_func(com):
    exec = False
    global time_
    if com.startswith(wakeword) or time.time() - time_ < time_wait:
        if com.startswith(wakeword):
            time_ = time.time()
        logging.info('Распознано: ' + com)
        com = com.split()

        # Category and parameter weights
        with codecs.open(Path('files/we.json').resolve(), 'r', 'utf-8') as data_we:
            we = json.load(data_we)
            data_we.close()

        # Compute the weights
        for i in com:
            try:
                ind_kw = 0
                for ind_kw in range(len(kw['main'][i])):
                    we['main'][kw['main'][i][ind_kw]['param']] += kw['main'][i][ind_kw]['weight']
            except:
                pass

        ca = max(we['main'], key=we['main'].get)

        for i in com:
            try:
                ind_kw = 0
                for ind_kw in range(len(kw[ca][i])):
                    we[ca][kw[ca][i][ind_kw]['param']] += kw[ca][i][ind_kw]['weight']
                exec = True
            except:
                pass

        # Execute the command
        if exec:
            pa = max(we[ca], key=we[ca].get)
            eval(f'{ca}(r"{pa}")')
        else:
            pass


# Key phrases
with codecs.open(Path('files/kw.json').resolve(), 'r', 'utf-8') as data_kw:
    kw = json.load(data_kw)
    data_kw.close()

# Load the text-to-speech model
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=language, speaker=model_id)
model.to(device)

# Main loop
with mic as source:
    if recognition == 'Google Speech Recognition':
        r.adjust_for_ambient_noise(source, duration=1)
        while True:
            com_rec = r.listen(source)
            try:
                com_rec = r.recognize_google(com_rec, language='ru-RU')
                main_func(com_rec.lower())
                repeat_recognized_text(com_rec)  # repeat the recognized text
            except:
                pass
    else:
        for com_rec in listen():
            main_func(com_rec.lower())
            repeat_recognized_text(com_rec)  # repeat the recognized text

popitka1.py (here I am trying to use threads):

import threading


def run_assistant():
    # here I pasted the ALPHA-MAIN-APP.py code


def run_camera():
    # here is a complete copy of the run.py code


thread_assistant = threading.Thread(target=run_assistant)
thread_camera = threading.Thread(target=run_camera)

thread_assistant.start()
thread_camera.start()
thread_assistant.join()
thread_camera.join()

When I run the program above, both parts start (which is already something), but the voice assistant does not react to commands. It recognizes them, but does not act on them.

In short, I want to add this voice assistant to run.py as well. I have tried, but it ends up one of two ways: either the assistant runs and the camera window never opens, or everything starts but the assistant does not respond to commands.
I would really appreciate any help.


Answers (1):

Answer by: gord1402

It looks like I found your bug. In the function def main_func(com): you use global time_. When the code runs at module level this works, because time_ really is declared as a global variable; but once you paste the code inside a function, time_ becomes a local variable of that function, while global only reaches module-level names. Here is code that reproduces the problem:

global_variable = 5


def main():
    parent_variable = 3

    def h():
        global parent_variable, global_variable
        parent_variable = 0
        global_variable = 0

    h()
    print(f"global_variable: {global_variable} parent_variable: {parent_variable}")


main()

Result:

global_variable: 0 parent_variable: 3
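
For comparison, a minimal sketch of the same demo with nonlocal instead of global: nonlocal binds h() to the variable of the enclosing function main(), so parent_variable does change:

global_variable = 5


def main():
    parent_variable = 3

    def h():
        nonlocal parent_variable  # binds to main()'s local variable, not a module-level name
        parent_variable = 0

    h()
    print(f"global_variable: {global_variable} parent_variable: {parent_variable}")


main()
# prints: global_variable: 5 parent_variable: 0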

To fix your problem, you can move everything above with mic as source: out of the function (this should not affect the threading). run_assistant would then look like this (a small self-contained sketch of the overall layout follows after the code):

def run_assistant():
    # Main loop
    with mic as source:
        if recognition == 'Google Speech Recognition':
            r.adjust_for_ambient_noise(source, duration=1)
            while True:
                com_rec = r.listen(source)
                try:
                    com_rec = r.recognize_google(com_rec, language='ru-RU')
                    main_func(com_rec.lower())
                    repeat_recognized_text(com_rec)  # repeat the recognized text
                except:
                    pass
        else:
            for com_rec in listen():
                main_func(com_rec.lower())
                repeat_recognized_text(com_rec)  # repeat the recognized text
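
And here is a small self-contained sketch of that layout (with stand-in command strings, just for illustration): everything that main_func touches through global stays at module level, and the thread target contains only the loop:

import threading
import time

# Module-level state, like in ALPHA-MAIN-APP.py: "global time_" inside
# main_func really refers to this name.
time_ = 0
time_wait = 5
wakeword = "alpha"  # stand-in wake word for this sketch


def main_func(com):
    global time_
    if com.startswith(wakeword) or time.time() - time_ < time_wait:
        if com.startswith(wakeword):
            time_ = time.time()
        print("executing:", com)


def run_assistant():
    # Only the listening loop lives inside the thread target; a fixed list
    # of fake commands stands in for the microphone here.
    for com in ["alpha open browser", "next tab", "unrelated noise"]:
        main_func(com)
        time.sleep(0.5)


thread_assistant = threading.Thread(target=run_assistant)
thread_assistant.start()
thread_assistant.join()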