首页 > 其他分享 >office pdf 文本提取

office pdf 文本提取

时间:2023-07-03 16:32:27浏览次数:40  
标签:const office int PID FILTER uint pdf 文本 public

using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;namespace IFilter
{
/// <summary>
/// Flags passed to <see cref="IFilter.Init"/>. Mirrors the native IFILTER_INIT enumeration.
/// </summary>
[Flags]
public enum IFILTER_INIT : uint
{
    NONE = 0,
    CANON_PARAGRAPHS = 1,
    HARD_LINE_BREAKS = 2,
    CANON_HYPHENS = 4,
    CANON_SPACES = 8,
    APPLY_INDEX_ATTRIBUTES = 16,
    APPLY_OTHER_ATTRIBUTES = 32,
    INDEXING_ONLY = 64,
    SEARCH_LINKS = 128,
    APPLY_CRAWL_ATTRIBUTES = 256,
    FILTER_OWNED_VALUE_OK = 512
}

/// <summary>
/// Kind of break reported in <see cref="STAT_CHUNK.breakType"/>. Mirrors the native CHUNK_BREAKTYPE enumeration.
/// </summary>
public enum CHUNK_BREAKTYPE
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

/// <summary>
/// Content kind reported in <see cref="STAT_CHUNK.flags"/>. Mirrors the native CHUNKSTATE enumeration.
/// </summary>
[Flags]
public enum CHUNKSTATE
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>
/// Managed declaration of the native PROPSPEC structure.
/// </summary>
// NOTE(review): the native PROPSPEC declares propid and lpwstr as members of a union that
// follows ulKind, whereas this declaration lays them out one after the other. The layout is
// kept as-is because callers may depend on these public fields; fields located after this
// structure inside STAT_CHUNK should not be trusted until this is confirmed and corrected.
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    public uint ulKind;
    public uint propid;
    public IntPtr lpwstr;
}

/// <summary>
/// Managed declaration of the native FULLPROPSPEC structure (property set GUID plus property specifier).
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    public Guid guidPropSet;
    public PROPSPEC psProperty;
}

/// <summary>
/// Chunk descriptor filled in by <see cref="IFilter.GetChunk"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    public uint idChunk;
    [MarshalAs(UnmanagedType.U4)] public CHUNK_BREAKTYPE breakType;
    [MarshalAs(UnmanagedType.U4)] public CHUNKSTATE flags;
    public uint locale;
    [MarshalAs(UnmanagedType.Struct)] public FULLPROPSPEC attribute;
    public uint idChunkSource;
    public uint cwcStartSource;
    public uint cwcLenSource;
}

/// <summary>
/// Region descriptor passed to <see cref="IFilter.BindRegion"/>.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    public uint idChunk;
    public uint cwcStart;
    public uint cwcExtent;
}

/// <summary>
/// COM import of the IFilter interface. The member order defines the COM vtable order and must not be changed.
/// </summary>
[ComImport]
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>
    /// Initializes a filtering session. Returns the raw HRESULT because of <c>PreserveSig</c>.
    /// </summary>
    [PreserveSig]
    int Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, uint cAttributes, [MarshalAs(UnmanagedType.LPArray, SizeParamIndex=1)] FULLPROPSPEC[] aAttributes, ref uint pdwFlags);

    /// <summary>
    /// Advances to the next chunk and describes it in <paramref name="pStat"/>. Returns the raw HRESULT.
    /// </summary>
    [PreserveSig]
    int GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// Reads text from the current chunk. On entry <paramref name="pcwcBuffer"/> is the buffer size in
    /// characters; on return it is the number of characters written. Returns the raw HRESULT.
    /// </summary>
    [PreserveSig]
    int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);

    /// <summary>
    /// Reads a property value from the current chunk. Declared without <c>PreserveSig</c>, so a failing
    /// HRESULT is surfaced as an exception by the interop layer.
    /// </summary>
    void GetValue(ref UIntPtr ppPropValue);

    /// <summary>
    /// Binds to a region of the filtered object. Declared without <c>PreserveSig</c>, so a failing
    /// HRESULT is surfaced as an exception by the interop layer.
    /// </summary>
    void BindRegion([MarshalAs(UnmanagedType.Struct)] FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
}

/// <summary>
/// COM-imported coclass identified by the CLSID in its <c>Guid</c> attribute.
/// </summary>
[ComImport]
[Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
public class CFilter
{
}

/// <summary>
/// Storage property identifiers (PID_STG_*) and IFilter HRESULTs expressed as signed integers,
/// ready to be compared with the <c>int</c> results of the <see cref="IFilter"/> methods.
/// </summary>
public class IFilterConstants
{
    public const uint PID_STG_DIRECTORY = 0x00000002;
    public const uint PID_STG_CLASSID = 0x00000003;
    public const uint PID_STG_STORAGETYPE = 0x00000004;
    public const uint PID_STG_VOLUME_ID = 0x00000005;
    public const uint PID_STG_PARENT_WORKID = 0x00000006;
    public const uint PID_STG_SECONDARYSTORE = 0x00000007;
    public const uint PID_STG_FILEINDEX = 0x00000008;
    public const uint PID_STG_LASTCHANGEUSN = 0x00000009;
    public const uint PID_STG_NAME = 0x0000000a;
    public const uint PID_STG_PATH = 0x0000000b;
    public const uint PID_STG_SIZE = 0x0000000c;
    public const uint PID_STG_ATTRIBUTES = 0x0000000d;
    public const uint PID_STG_WRITETIME = 0x0000000e;
    public const uint PID_STG_CREATETIME = 0x0000000f;
    public const uint PID_STG_ACCESSTIME = 0x00000010;
    public const uint PID_STG_CHANGETIME = 0x00000011;
    public const uint PID_STG_CONTENTS = 0x00000013;
    public const uint PID_STG_SHORTNAME = 0x00000014;
    public const int FILTER_E_END_OF_CHUNKS = (unchecked((int) 0x80041700));
    public const int FILTER_E_NO_MORE_TEXT = (unchecked((int) 0x80041701));
    public const int FILTER_E_NO_MORE_VALUES = (unchecked((int) 0x80041702));
    public const int FILTER_E_NO_TEXT = (unchecked((int) 0x80041705));
    public const int FILTER_E_NO_VALUES = (unchecked((int) 0x80041706));
    public const int FILTER_S_LAST_TEXT = (unchecked((int) 0x00041709));
}

/// <summary>
/// IFilter return codes
/// </summary>
public enum IFilterReturnCodes : uint
{
    /// <summary>
    /// Success
    /// </summary>
    S_OK = 0,
    /// <summary>
    /// The function was denied access to the filter file.
    /// </summary>
    E_ACCESSDENIED = 0x80070005,
    /// <summary>
    /// The function encountered an invalid handle, probably due to a low-memory situation.
    /// </summary>
    E_HANDLE = 0x80070006,
    /// <summary>
    /// The function received an invalid parameter.
    /// </summary>
    E_INVALIDARG = 0x80070057,
    /// <summary>
    /// Out of memory
    /// </summary>
    E_OUTOFMEMORY = 0x8007000E,
    /// <summary>
    /// Not implemented
    /// </summary>
    E_NOTIMPL = 0x80004001,
    /// <summary>
    /// Unknown error
    /// </summary>
    // 0x80004005 is the Win32 value of E_FAIL; 0x80000008 is only the legacy 16-bit definition.
    E_FAIL = 0x80004005,
    /// <summary>
    /// File not filtered due to password protection
    /// </summary>
    FILTER_E_PASSWORD = 0x8004170B,
    /// <summary>
    /// The document format is not recognised by the filter
    /// </summary>
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    /// <summary>
    /// No text in current chunk
    /// </summary>
    FILTER_E_NO_TEXT = 0x80041705,
    /// <summary>
    /// No more chunks of text available in object
    /// </summary>
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    /// <summary>
    /// No more text available in chunk
    /// </summary>
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    /// <summary>
    /// No more property values available in chunk
    /// </summary>
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    /// <summary>
    /// Unable to access object
    /// </summary>
    FILTER_E_ACCESS = 0x80041703,
    /// <summary>
    /// Moniker doesn't cover entire region
    /// </summary>
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    /// <summary>
    /// Unable to bind IFilter for embedded object
    /// </summary>
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    /// <summary>
    /// Unable to bind IFilter for linked object
    /// </summary>
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    /// <summary>
    /// This is the last text in the current chunk
    /// </summary>
    FILTER_S_LAST_TEXT = 0x00041709,
    /// <summary>
    /// This is the last value in the current chunk
    /// </summary>
    FILTER_S_LAST_VALUES = 0x0004170A
}

/// <summary>
/// Convenience class which provides static methods to extract text from files using installed IFilters
/// </summary>
public class DefaultParser
{
    // HRESULTs are compared as signed ints, exactly as the PreserveSig methods return them.
    // Casting a failing HRESULT to the uint-based IFilterReturnCodes at run time would throw
    // OverflowException in an assembly compiled with overflow checking enabled.
    private const int S_OK = 0;
    private const int FilterEmbeddingUnavailable = unchecked((int) IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE);
    private const int FilterLinkUnavailable = unchecked((int) IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE);

    // Size, in UTF-16 characters, of the buffer handed to IFilter.GetText on each call.
    private const uint TextBufferLength = 65536;

    public DefaultParser()
    {
    }

    [DllImport("query.dll", CharSet = CharSet.Unicode)]
    private extern static int LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);

    /// <summary>
    /// Asks query.dll for the IFilter registered for <paramref name="filename"/>.
    /// </summary>
    /// <returns>The filter, or <c>null</c> when LoadIFilter does not return S_OK.</returns>
    private static IFilter loadIFilter(string filename)
    {
        object outer = null;
        IFilter filter = null;

        // Try to load the corresponding IFilter
        int resultLoad = LoadIFilter(filename, outer, ref filter);
        if (resultLoad != S_OK)
        {
            return null;
        }
        return filter;
    }

    /// <summary>
    /// Tells whether an IFilter can be loaded for <paramref name="filename"/>.
    /// </summary>
    /// <param name="filename">Path of the file to probe.</param>
    /// <returns><c>true</c> when a filter was loaded, otherwise <c>false</c>.</returns>
    public static bool IsParseable(string filename)
    {
        IFilter filter = loadIFilter(filename);
        if (filter == null)
        {
            return false;
        }

        // The probe only needs to know that loading succeeded; release the COM object right away
        // instead of leaving it (and whatever it holds open) to the garbage collector.
        Marshal.ReleaseComObject(filter);
        return true;
    }

    /// <summary>
    /// Extracts the text of every text chunk of the file, each piece followed by a single space.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The extracted text, or <see cref="String.Empty"/> when no IFilter could be loaded.</returns>
    /// <exception cref="InvalidOperationException">The filter was loaded but its Init call did not return S_OK.</exception>
    public static string Extract(string path)
    {
        StringBuilder sb = new StringBuilder();
        IFilter filter = null;

        try
        {
            filter = loadIFilter(path);
            if (filter == null)
            {
                return String.Empty;
            }

            IFILTER_INIT iflags =
                IFILTER_INIT.CANON_HYPHENS |
                IFILTER_INIT.CANON_PARAGRAPHS |
                IFILTER_INIT.CANON_SPACES |
                IFILTER_INIT.APPLY_CRAWL_ATTRIBUTES |
                IFILTER_INIT.APPLY_INDEX_ATTRIBUTES |
                IFILTER_INIT.APPLY_OTHER_ATTRIBUTES |
                IFILTER_INIT.HARD_LINE_BREAKS |
                IFILTER_INIT.SEARCH_LINKS |
                IFILTER_INIT.FILTER_OWNED_VALUE_OK;

            uint initResultFlags = 0;
            if (filter.Init(iflags, 0, null, ref initResultFlags) != S_OK)
            {
                throw new InvalidOperationException("Problem initializing an IFilter for:\n" + path + " \n\n");
            }

            while (true)
            {
                STAT_CHUNK chunk;
                int chunkResult = filter.GetChunk(out chunk);

                // An embedded or linked object without a usable filter only makes that one chunk
                // unavailable; keep going so the rest of the document is still extracted.
                if (chunkResult == FilterEmbeddingUnavailable || chunkResult == FilterLinkUnavailable)
                {
                    continue;
                }

                // FILTER_E_END_OF_CHUNKS or any other result ends the extraction.
                if (chunkResult != S_OK)
                {
                    break;
                }

                // CHUNKSTATE is a [Flags] enum, so test the bit rather than the whole value.
                if ((chunk.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
                {
                    AppendChunkText(filter, sb);
                }
            }
        }
        finally
        {
            if (filter != null)
            {
                Marshal.ReleaseComObject(filter);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of the filter's current chunk to <paramref name="output"/>, one space after each piece.
    /// </summary>
    private static void AppendChunkText(IFilter filter, StringBuilder output)
    {
        int textResult;
        do
        {
            uint charCount = TextBufferLength;
            StringBuilder buffer = new StringBuilder((int) TextBufferLength);

            textResult = filter.GetText(ref charCount, buffer);
            if (textResult < 0)
            {
                // Failure HRESULT (e.g. FILTER_E_NO_MORE_TEXT): the buffer content is not meaningful.
                break;
            }

            if (charCount > 0 && buffer.Length > 0)
            {
                // The reported count and the marshalled length can disagree; never read past either.
                int length = (int) Math.Min(charCount, (uint) buffer.Length);
                output.Append(buffer.ToString(0, length));
                output.Append(" "); // "\r\n"
            }
        }
        while (textResult == S_OK); // FILTER_S_LAST_TEXT means this chunk has nothing more to give.
    }
}

作者:古道轻风,


标签:const,office,int,PID,FILTER,uint,pdf,文本,public
From: https://blog.51cto.com/chunyangi/6612844

相关文章

  • 【Python自制工具软件】批量图片转PDF小工具——PIC2PDF
    楔子大家在工作当中总会冒出各种各样的需求,尤其当面对繁琐的工作时。“如果有那样一款想象中的工具就好了!”可以瞬间解决手头工作的想象中的工具,是否存在呢?当然,然而获得它总是需要我们花费大量的时间去筛选甄别,有的充斥大量广告,有的则需要付出不菲的费用。其实完全可以自己上......
  • 使用 ABAP 调用 Adobe Document Service 生成 PDF 文档
    我以前在 SAP 成都研究院 BYD Form 开发团队工作过 5 年,负责 BYD BO 输出成 PDF 的功能开发。Adobe Document Service(ADS)是 SAP NetWeaver AS Java 堆栈的一部分,提供了用于创建和处理 PDF 文件的功能。在 ABAP 系统中,可以通过调用 ADS 服务来生成和处理 PDF 文档。这种集成使得 ABAP 开发人员能......
  • print-js 实现页面打印PDF,与样式缺失问题
    参考https://blog.csdn.net/qq_36990322/article/details/105786298(样式继承问题)https://blog.csdn.net/qq_42571665/article/details/127277049(宽度配置)说明样式缺失是因为默认不继承样式。环境软件/系统版本说明dayjs^1.6.0步骤安装yarnaddprin......
  • 文本格式数据读写
    数据载入、存储及文件格式文本格式数据的读写函数描述read_csv默认分隔符逗号read_table制表符(’\t‘)是默认分隔符read_excel从excel读取read_json从JSON字符串读取读取csv文件df=pd.read_csv('examples/ex1.csv')有些文件不包含表头,可以默认分......
  • WPS与OFFICE的差异
    WPS与OFFICE的差异会越来越大,此篇也不是为了把所有的差异都列出来,而是就一些容易被老板扔文件夹到头上的问题拎出来……IF函数Office中If函数第二/三参数光挂一个逗号时,返回的结果是0,内部的运算结果也是0;而WPS同样的公式,虽然单独IF的结果仍是0,但其实际计算为空。未完待续……......
  • uni.app 给大家推荐一个 非常牛的 插件 上传图片 上传 word 上传xlsx 上传pdf
     插件地址地址:https://ext.dcloud.net.cn/plugin?name=lsj-upload兼容vue2 vue3微信小程序 等等等等图片放在下面了使用方法里面有介绍又到了码农 最重要的时刻了ctrlc  ctrlv  大法   ......
  • 移动端 根据后端返回 的地址 使用 pdf 打开 浏览下载 uni.app app h5
     话不多说肝着肝了三天三夜 才搞明白 下面是经过压缩压缩再压缩减少减少再减少的代码简易易懂 移动端根据后端返回的地址使用pdf打开浏览下载 可以使用 插件 把这个插件直接拉到与pages同级目录下面就好了  插件地址  https://toscode.g......
  • ERP 软件领域的 frontend office 和 backend office
    在企业资源规划(ERP)软件领域中,前端(frontend office)和后端(backend office)通常指的是企业内部操作和管理的不同方面。它们之间的区别在于,前端主要关注与客户交互和销售相关的任务,而后端则涉及企业内部的运营和管理。在本文中,我们将详细介绍前端和后端办公的概念、功能和优势,以及它们......
  • Linux Shell文本处理
    预计更新1:基础知识简介和安装基本命令变量和环境变量2:流程控制条件语句循环语句函数3:文件处理文件读写文件权限和所有权文件搜索和替换4:网络和进程网络通信进程管理信号处理5:文本处理正则表达式文本分析和处理生成报告和日志6:用户界面命令行参数和选......
  • 未来的LibreOffice版本将允许导入和导出APNG图像
    导读文档基金会已经完成了其LibreOffice开源办公套件在2023年谷歌夏季代码(GSoC)中的项目选择,承诺将提供用户要求已久的新功能和改进。未来的LibreOffice版本将允许你导入和导出APNG(AnimatedPNG)图像,这是一种向后兼容PNG图像格式的动画格式,支持比GIF图像更多的颜色和部分透......