使用 chatgpt 从采访脚本中提取问题和答案

标签：python machine-learning openai-api chatgpt-api google-generativeai

我的需求是根据 Skill.csv 文件中给出的技能列表，从面试脚本中提取问题和答案。我构建了一个生成式人工智能流程，按给定的技能列表提取问答，但结果中只有约 30% 的问题、答案和技能标签是正确的。请查看下面的代码并给出需要的改进。我的输入是一个 csv 文件，其中有一列包含“python、sql、java、power bi”等技能；面试（interview）脚本为 JSON 格式。

from flask import Flask, request, jsonify
import json
import logging
import re
import pandas as pd
from pydantic import BaseModel, ValidationError
from typing import List, Dict, Optional
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
import openai

# Define your models
class QuestionAnswer(BaseModel):
    """A single question paired with its answer."""
    # Text of the question.
    question: str
    # Text of the answer.
    answer: str

class SkillQnA(BaseModel):
    """Question/answer pairs grouped under one skill."""
    # Name of the skill the pairs belong to.
    skill: str
    # Maps question text to answer text.
    question_and_answers: Dict[str, str]

class ExtractedSkills(BaseModel):
    """Container for the per-skill Q&A groups."""
    # One entry per skill.
    skills: List[SkillQnA]

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
# NOTE(review): placeholder credential. Load the real key from the environment
# or a secrets store rather than committing it to source control.
openai.api_key = 'xxx'
# Load Sentence-BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

nltk.download('stopwords')
nltk.download('punkt')

# Initialize the stop words list
stop_words = set(stopwords.words('english'))

# Load skills from CSV file (read once; the frame is kept for later use).
csv_path = 'skill.csv'
skills_df = pd.read_csv(csv_path)
# Lower-case and trim every skill. Missing cells are dropped and blank cells
# are skipped so that neither NaN nor '' ends up in the lookup set.
known_skills = {
    skill
    for skill in skills_df['skill'].dropna().astype(str).str.lower().str.strip()
    if skill
}


def extract_skills_from_transcript(script: str, skills: Optional[set] = None) -> set:
    """Return the skills that are mentioned in ``script``.

    Each skill is searched for as a whole phrase, case-insensitively. A match
    must not be directly preceded or followed by a word character, so "java"
    does not match inside "javascript", while skills that end in punctuation
    such as "c++" can still be found. Words of a multi-word skill may be
    separated by any run of whitespace.

    Args:
        script: Transcript text to search.
        skills: Skill names to look for. Defaults to the module-level
            ``known_skills`` set.

    Returns:
        The subset of the skills (spelled as given) found in ``script``.
    """
    candidates = known_skills if skills is None else skills
    haystack = script.lower()

    found = set()
    for skill in candidates:
        # Non-string entries (e.g. NaN from an empty CSV cell) cannot match.
        if not isinstance(skill, str):
            continue
        parts = skill.lower().split()
        # A blank skill would produce a pattern that matches everywhere.
        if not parts:
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # which fails after skills ending in '+' or '#'.
        pattern = (
            r'(?<!\w)'
            + r'\s+'.join(re.escape(part) for part in parts)
            + r'(?!\w)'
        )
        if re.search(pattern, haystack):
            found.add(skill)
    return found

# Function to extract skills from the transcript using the known skills from CSV
def extract_skills_from_csv(script: str) -> set:
    """Return the single-word tokens of ``script`` that are known skills.

    The script is lower-cased and split into ``\\w+`` tokens; every token that
    is a member of the module-level ``known_skills`` is returned.
    """
    tokens = re.findall(r'\b\w+\b', script.lower())
    matched = set()
    for token in tokens:
        if token in known_skills:
            matched.add(token)
    return matched

# Function to calculate semantic similarity
def semantic_similarity(question: str, answer: str) -> float:
    """Return the cosine similarity of the two texts' sentence embeddings.

    Both texts are encoded with the module-level ``model`` and compared with
    ``util.pytorch_cos_sim``; the single resulting value is returned as a float.
    """
    embeddings = [
        model.encode(text, convert_to_tensor=True)
        for text in (question, answer)
    ]
    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

# Function to extract Q&A pairs using OpenAI API
def extract_qna_using_openai(script: List[Dict[str, str]], skills: List[str]) -> Optional[ExtractedSkills]:
    """Ask the chat model to group the transcript's Q&A pairs by skill.

    The transcript is serialised to JSON and embedded in a prompt together
    with the skill list. The first ``[`` through the last ``]`` of the reply
    is parsed as a JSON array, entries whose skill is not in ``skills`` are
    discarded, and the rest is validated into ``ExtractedSkills``.

    Skill names returned by the model are compared case-insensitively and
    rewritten to the spelling used in ``skills``, so a reply of "Python" is
    kept when the list contains "python".

    Args:
        script: Transcript segments.
        skills: Skill names to extract Q&A pairs for.

    Returns:
        The validated result, or ``None`` when the API call fails, the reply
        contains no JSON array, or the parsed data fails validation.
    """
    transcript = json.dumps(script, indent=2)
    skills_str = ", ".join(skills)
    # Normalised name -> caller's spelling, for case-insensitive filtering.
    canonical_skills = {skill.strip().lower(): skill for skill in skills}

    prompt = f"""
    Given the following interview transcript and a list of skills, extract all questions and answers related to each skill.
    Extract only the questions and answers relevant to each skill listed below and group them by the name of the skill.
    Ensure that the answers are accurate and correspond correctly to the related questions and skills.
    Do not include questions and answers for any skills not listed.
    Ensure the same answers do not repeat across multiple skill groups. If an answer is relevant to multiple skills, 
    include only the portion specific to each skill in the respective groups.
    Please ensure that you will not include soft skills as skills.
    List the skills in numeric order.

    Skills: {skills_str}

    Transcript:
    {transcript}

    Provide the output in the following format:
    [
        {{
            "skill": "name of skill1",
            "question_and_answers": {{
                "question1": "answer portion specific to skill1",
                "question2": "answer portion specific to skill1"
            }}
        }},
        {{
            "skill": "name of skill2",
            "question_and_answers": {{
                "question1": "answer portion specific to skill2",
                "question2": "answer portion specific to skill2"
            }}
        }},
        ...
        {{
            "skill": "name of skillN",
            "question_and_answers": {{
                "question1": "answer portion specific to skillN",
                "question2": "answer portion specific to skillN"
            }}
        }}
    ]
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=1500
        )

        if 'choices' not in response or not response['choices']:
            logging.error("No choices in OpenAI API response")
            return None

        raw_response = response['choices'][0]['message'].get('content', None)
        logging.info(f"Raw response from OpenAI: {raw_response}")

        if not raw_response:
            logging.error("Received empty response from OpenAI API.")
            return None

        # The reply may wrap the array in prose or a code fence, so keep only
        # the outermost bracketed span.
        start_index = raw_response.find("[")
        end_index = raw_response.rfind("]") + 1
        if start_index == -1 or end_index <= start_index:
            logging.error("No JSON array found in OpenAI response")
            return None
        json_str = raw_response[start_index:end_index]
        logging.info(f"Extracted JSON string: {json_str}")

        result = json.loads(json_str)
        logging.info(f"Parsed JSON result: {result}")

        if not isinstance(result, list):
            logging.error("Parsed OpenAI response is not a JSON array")
            return None

        # Keep only well-formed entries for requested skills, using the
        # requested spelling of the skill name.
        matched_items = []
        for item in result:
            if not isinstance(item, dict):
                continue
            skill_key = str(item.get('skill', '')).strip().lower()
            if skill_key in canonical_skills:
                matched_items.append({**item, 'skill': canonical_skills[skill_key]})

        # Validate the result and ensure it matches the expected format
        validated_result = ExtractedSkills(skills=[SkillQnA(**item) for item in matched_items])
    except (json.JSONDecodeError, ValueError, ValidationError) as e:
        logging.error(f"Error decoding JSON: {e}")
        return None
    except openai.error.OpenAIError as e:
        logging.error(f"OpenAI API error: {e}")
        return None
    except Exception as e:
        logging.error(f"Exception occurred: {e}")
        return None

    return validated_result

# Function to process Q&A pairs
def process_qna_pairs(extracted_skills: ExtractedSkills, script: List[Dict[str, str]]) -> ExtractedSkills:
    """Filter, extend and clean the Q&A groups produced by the extractor.

    The groups go through four passes, in order:
      1. keep only pairs accepted by ``is_relevant_qna``;
      2. keep only pairs whose question passes ``is_valid_question``;
      3. add questions reported by ``get_important_questions`` that are not
         present yet, answered via ``generate_answer_for_question``;
      4. run every answer through ``cleanup_answer``.

    A group left without pairs after pass 1 or pass 2 is dropped.
    """

    def _filter_groups(groups, accept):
        # Rebuild each group from the pairs ``accept`` keeps; skip empty ones.
        kept_groups = []
        for group in groups:
            surviving = {}
            for q, a in group.question_and_answers.items():
                if accept(q, a, group.skill):
                    surviving[q] = a
            if surviving:
                kept_groups.append(SkillQnA(skill=group.skill, question_and_answers=surviving))
        return kept_groups

    # Pass 1: relevance.
    relevant_groups = _filter_groups(extracted_skills.skills, is_relevant_qna)

    # Pass 2: question well-formedness.
    valid_groups = _filter_groups(
        relevant_groups,
        lambda q, a, skill: is_valid_question(q),
    )

    # Pass 3: add questions from the script that are not present yet.
    for group in valid_groups:
        pairs = group.question_and_answers
        for candidate in get_important_questions(group.skill, script, list(pairs.keys())):
            if candidate in pairs:
                continue
            pairs[candidate] = generate_answer_for_question(candidate, script)

    # Pass 4: clean every answer in place.
    for group in valid_groups:
        pairs = group.question_and_answers
        for q in pairs:
            pairs[q] = cleanup_answer(pairs[q])

    return ExtractedSkills(skills=valid_groups)

def is_relevant_qna(question: str, answer: str, skill: str, threshold: float = 0.4) -> bool:
    """Decide whether a Q&A pair should be kept for ``skill``.

    The pair is accepted when the skill name appears as a whole term
    (case-insensitive) in the question or the answer. Otherwise it is accepted
    only if ``semantic_similarity(question, answer)`` exceeds ``threshold``.

    Args:
        question: Question text.
        answer: Answer text.
        skill: Skill name to look for.
        threshold: Minimum similarity for the fallback check.

    Returns:
        True when the pair is accepted.
    """
    # Lookarounds instead of \b: a trailing \b cannot match after skills that
    # end in a non-word character such as "c++" or "c#".
    skill_pattern = re.compile(
        r'(?<!\w)' + re.escape(skill) + r'(?!\w)',
        re.IGNORECASE,
    )
    if skill_pattern.search(question) or skill_pattern.search(answer):
        return True

    # NOTE(review): this fallback compares the question with the answer and
    # does not involve the skill itself.
    return semantic_similarity(question, answer) > threshold

def is_valid_question(question: str) -> bool:
    """Return True for text that ends with '?' and has more than two words.

    Surrounding whitespace is ignored for the '?' check; words are counted by
    splitting on whitespace.
    """
    if not question.strip().endswith('?'):
        return False
    word_count = len(question.split())
    return word_count > 2

def get_important_questions(skill: str, script: List[Dict[str, str]], existing_questions: List[str]) -> List[str]:
    """Collect questions in the script that mention ``skill`` and are not known yet.

    In each item's ``content``, a question is a run of non-'?' characters that
    contains the skill name as a whole term (case-insensitive) and ends at the
    next '?'. Each question is returned once, with surrounding whitespace
    removed, in order of first appearance.

    Args:
        skill: Skill name to look for.
        script: Transcript items; the text is read from each item's 'content'.
        existing_questions: Questions that must not be returned again.

    Returns:
        The new questions mentioning the skill.
    """
    # Lookarounds instead of \b so skills ending in a non-word character
    # ("c++", "c#") can match.
    question_pattern = re.compile(
        r'[^?]*(?<!\w)' + re.escape(skill) + r'(?!\w)[^?]*\?',
        re.IGNORECASE,
    )
    seen = {existing.strip() for existing in existing_questions}
    important_questions = []
    for item in script:
        # Missing or None content is treated as empty text.
        text = item.get('content') or ''
        for match in question_pattern.findall(text):
            # A match starts right after the previous '?', so it usually
            # carries leading whitespace.
            question = match.strip()
            if question and question not in seen:
                seen.add(question)
                important_questions.append(question)
    return important_questions

def generate_answer_for_question(question: str, script: List[Dict[str, str]]) -> str:
    """Look up the text that answers ``question`` in the script.

    The first item whose 'content' contains the question is used. If text
    follows the question on the same line, that text is the answer. Otherwise
    the content of the next script item is used, which covers transcripts
    where the question and its reply are stored as separate items.

    Args:
        question: Question text to locate.
        script: Transcript items; the text is read from each item's 'content'.

    Returns:
        The answer text with surrounding whitespace removed. This is an empty
        string when the question is found but nothing follows it, and the
        placeholder "Generated answer based on the script" when the question
        does not occur in the script.
    """
    for index, item in enumerate(script):
        text = item.get('content') or ''
        if question not in text:
            continue

        # Text after the first occurrence of the question, up to end of line.
        start = text.find(question) + len(question)
        same_line = text[start:].split('\n', 1)[0].strip()
        if same_line:
            return same_line

        # Nothing after the question in this item: use the next item.
        # NOTE(review): assumes the next item is the reply to this question.
        if index + 1 < len(script):
            return (script[index + 1].get('content') or '').strip()
        return same_line
    return "Generated answer based on the script"

def cleanup_answer(answer: str) -> str:
    """Return ``answer`` with stop words removed and whitespace collapsed.

    The text is tokenized with ``nltk.word_tokenize``. Tokens whose lower-case
    form is in the module-level ``stop_words`` are dropped, and the remaining
    tokens are joined with single spaces.
    """
    kept_tokens = (
        token
        for token in nltk.word_tokenize(answer)
        if token.lower() not in stop_words
    )
    # Collapse whitespace runs that survive inside tokens, then trim.
    return re.sub(r'\s+', ' ', ' '.join(kept_tokens)).strip()

# Flask route to handle the processing of the transcript
@app.route('/process_transcript', methods=['POST'])
def process_transcript():
    """Handle an uploaded transcript file and return the extracted Q&A.

    Expects a multipart upload with a 'file' part containing a JSON array of
    segments, each with a string 'content'. Skills are detected in the
    combined text, Q&A pairs are extracted through the OpenAI helper, and the
    pairs are post-processed.

    Returns:
        A JSON response and status code: 200 with 'skills', 'extracted_qna'
        and 'validated_qna' on success; 400 for a missing or invalid upload;
        500 when Q&A extraction fails or an unexpected error occurs.
    """
    logging.info("Received request to process transcript")

    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    if not file:
        return jsonify({"error": "Invalid file format"}), 400

    try:
        # Load the JSON data from the file
        data = json.load(file)
        logging.info("JSON data loaded successfully")

        # Reject malformed transcripts as a client error before any
        # processing, instead of failing later with a 500.
        if not isinstance(data, list) or not all(
            isinstance(segment, dict) and isinstance(segment.get('content'), str)
            for segment in data
        ):
            logging.error("Transcript is not a list of segments with string 'content'")
            return jsonify({"error": "Invalid transcript format"}), 400

        # Extract Skills
        script_text = " ".join(segment['content'] for segment in data)
        extracted_skills = extract_skills_from_transcript(script_text)
        logging.info(f"Extracted Skills: {extracted_skills}")

        # Extract Q&A Pairs
        extracted_qna = extract_qna_using_openai(data, list(extracted_skills))
        if not extracted_qna:
            return jsonify({"error": "Failed to extract Q&A pairs"}), 500
        logging.info(f"Extracted Q&A Pairs: {extracted_qna.dict()}")

        # Validate Q&A Pairs
        processed_qna = process_qna_pairs(extracted_qna, data)
        logging.info("Processed Q&A pairs successfully")

        return jsonify({
            "skills": list(extracted_skills),
            "extracted_qna": extracted_qna.dict(),
            "validated_qna": processed_qna.dict()
        }), 200

    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Undecodable bytes are a bad upload too, not a server fault.
        logging.error(f"JSON decode error: {e}")
        return jsonify({"error": "Invalid JSON format"}), 400
    except ValidationError as e:
        logging.error(f"Validation error: {e}")
        logging.error(f"Validation errors detail: {e.errors()}")
        return jsonify({"error": "Data validation failed"}), 400
    except Exception as e:
        # Top-level boundary: log with traceback and report the failure.
        logging.exception(f"An unexpected error occurred: {e}")
        return jsonify({"error": str(e)}), 500


if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's interactive debugger and
    # reloader; use it for local development only.
    app.run(debug=True, port=5003)

你的代码结构良好，并做出了明智的选择（使用Sentence-BERT进行语义相似性计算，以及使用OpenAI进行问答提取）。然而，你遇到的准确率低问题（只有30%的准确率）是由于依赖大型语言模型（如GPT-3.5-turbo）的固有挑战造成的，这些模型有时会产生幻觉或偏离指令。

以下是如何改进代码以解决这些问题并提高准确率的方法：

改进OpenAI提示词 ：
- 更具体 ：在你的提示词中加入更多结构和限制。明确指定所需的输出格式，并提供示例。
- 基于角色的提示词 ：考虑采用“面试官”和“候选人”的角色，让模型更容易理解对话的结构。
- 分解任务 ：与其要求模型一次性提取所有内容，不如将任务分解成更小的步骤（例如，首先提取与技能相关的所有句子，然后在这些句子中识别问题和答案）。
实现后处理步骤 ：
- 答案验证 ：在OpenAI的输出中加入一个步骤，根据文稿验证提取到的答案。如果答案与文稿不一致，则将其标记为需要人工审查。
- 问题去重 ：实现逻辑以识别和删除重复或非常相似的问题，确保每个技能只有一个问题集。
微调模型 ：
- 对于更准确和针对特定领域的结果，考虑使用类似技能的问答对数据集对Sentence-BERT进行微调。这将提高语义相似性计算的准确率。
探索其他问答模型 ：
- 除了OpenAI之外，还可以探索其他问答模型，例如Hugging Face Transformers提供的模型。某些模型可能更适合你的特定用例。

以下是结合这些改进后的代码示例：

import json
import re
import logging
from typing import List, Dict, Optional

import nltk
import pandas as pd
from flask import Flask, request, jsonify
from nltk.corpus import stopwords
from pydantic import BaseModel, ValidationError
from sentence_transformers import SentenceTransformer, util
import openai

# ...（你现有的代码，用于模型加载、技能加载和实用函数）

def extract_qna_using_openai(script: List[Dict[str, str]], skills: List[str]) -> Optional[ExtractedSkills]:
    """Build a speaker-labelled prompt and request skill-grouped Q&A from the chat model.

    This snippet is intentionally partial: the handling of the API response
    and the ``except`` clauses of the ``try`` block are elided.
    """
    # Render each segment as "<Speaker>: <content>"; any speaker other than
    # 'Interviewer' is labelled "Candidate".
    # NOTE(review): assumes every segment has 'speaker' and 'content' keys;
    # confirm against the transcript schema.
    transcript = "\n".join([f"Interviewer: {segment['content']}" if segment['speaker'] == 'Interviewer' else f"Candidate: {segment['content']}"  for segment in script])
    skills_str = ", ".join(skills)

    # Doubled braces in the f-string produce literal braces in the example output.
    prompt = f"""
    You are an interviewer assistant tasked with analyzing an interview transcript to identify questions and answers related to specific technical skills. 

    Your objective is to extract only the questions asked by the Interviewer and the corresponding answers given by the Candidate that are relevant to the following technical skills: {skills_str}. 

    Here are some important rules to follow:

    * **Focus on technical skills:**  Only extract information related to the listed technical skills. Avoid extracting information about soft skills or personality traits. 
    * **Accuracy is key:** Ensure that the extracted answers accurately reflect the candidate's response to the question and are relevant to the associated skill. 
    * **Avoid repetition:** Do not repeat the same answer for multiple skills. If an answer is relevant to multiple skills, include only the portion specific to each skill. 

    **Interview Transcript:**

    ```
    {transcript}
    ```

    **Output Format:**

    ```json
    [
        {{
            "skill": "skill 1",
            "question_and_answers": {{
                "question 1 related to skill 1": "answer 1 specific to skill 1",
                "question 2 related to skill 1": "answer 2 specific to skill 1"
            }}
        }},
        {{
            "skill": "skill 2",
            "question_and_answers": {{
                "question 1 related to skill 2": "answer 1 specific to skill 2"
            }}
        }},
        ...
    ]
    ```
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,  # Adjust temperature for creativity
            max_tokens=2000   # Adjust max_tokens as needed
        )

        # ... (your existing code for handling the OpenAI response and validation)

# ...（你现有的代码，用于后处理、Flask路由和运行应用程序）

记住：用你的实际数据测试这些更改，并根据需要调整参数和提示词。准确率的提高是一个迭代过程，需要根据你的特定数据集和需求进行实验。

标签：python,machine-learning,openai-api,chatgpt-api,google-generativeai
From： 78812091

使用 chatgpt 从采访脚本中提取问题和答案

相关文章

赞助商

阅读排行