Efficient Training Data Collector for Anime Waifu.
This project is still under development; the official version will be released soon.
If you need to use it immediately, just clone the repository and run `pip install .` in its root directory.
The PyPI version is not ready yet, so please install waifuc from source:
pip install git+https://github.com/deepghs/waifuc.git@main#egg=waifuc
If CUDA is available in your environment, you can use the following installation command to achieve higher performance:
pip install git+https://github.com/deepghs/waifuc.git@main#egg=waifuc[gpu]
If you need to process videos, you can install waifuc with:
pip install git+https://github.com/deepghs/waifuc.git@main#egg=waifuc[video]
For more information about installation, you can refer to Installation.
Grab surtr (arknights)'s dataset for LoRA Training
from waifuc.action import NoMonochromeAction, FilterSimilarAction, \
TaggingAction, PaddingAlignAction, PersonSplitAction, FaceCountAction, FirstNSelectAction, \
CCIPAction, ModeConvertAction, ClassFilterAction, RandomFilenameAction, AlignMinSizeAction
from waifuc.export import TextualInversionExporter
from waifuc.source import GcharAutoSource
if __name__ == '__main__':
    # Data source for surtr in arknights; images from many sites will be crawled.
    # All supported games and sites can be found at
    # https://narugo1992.github.io/gchar/main/best_practice/supported/index.html#supported-games-and-sites
    # ATTENTION: GcharAutoSource requires `git+https://github.com/deepghs/waifuc.git@main#egg=waifuc[gchar]`
    s = GcharAutoSource('surtr')

    # Crawl images, process them, and then save them to a directory in the given format.
    s.attach(
        # preprocess images with white background RGB
        ModeConvertAction('RGB', 'white'),

        # pre-filtering for images
        NoMonochromeAction(),  # no monochrome, greyscale or sketch
        ClassFilterAction(['illustration', 'bangumi']),  # no comic or 3d
        # RatingFilterAction(['safe', 'r15']),  # filter images with rating, like safe, r15, r18
        FilterSimilarAction('all'),  # filter duplicated images

        # human processing
        FaceCountAction(1),  # drop images with 0 or >1 faces
        PersonSplitAction(),  # crop for each person
        FaceCountAction(1),  # same face-count filter again, now applied after the split

        # CCIP, filter the character you may not want to see in dataset
        CCIPAction(min_val_count=15),

        # if min(height, width) > 800, resize it to 800
        AlignMinSizeAction(800),

        # tagging with wd14 v2, if you don't need character tag, set character_threshold=1.01
        TaggingAction(force=True),

        PaddingAlignAction((512, 512)),  # align to 512x512
        FilterSimilarAction('all'),  # filter again
        FirstNSelectAction(200),  # first 200 images

        # MirrorAction(),  # mirror image for data augmentation
        RandomFilenameAction(ext='.png'),  # random rename files
    ).export(
        # save to surtr_dataset directory
        TextualInversionExporter('surtr_dataset')
    )
The following code will give you 10 images of surtr (arknights) with metadata saved.
from waifuc.action import HeadCountAction, AlignMinSizeAction
from waifuc.export import SaveExporter
from waifuc.source import DanbooruSource
if __name__ == '__main__':
    # Search danbooru with the two tags below.
    source = DanbooruSource(['surtr_(arknights)', 'solo'])
    source.attach(
        # only 1 head,
        HeadCountAction(1),

        # if shorter side is over 640, just resize it to 640
        AlignMinSizeAction(640),
    )[:10].export(  # only first 10 images
        # save images (with meta information from danbooru site)
        SaveExporter('/data/surtr_arknights')
    )
And this is what's in /data/surtr_arknights afterwards:
Similarly, you can crawl from pixiv with almost the same code, just by changing the source:
from waifuc.action import HeadCountAction, AlignMinSizeAction, CCIPAction
from waifuc.export import SaveExporter
from waifuc.source import PixivSearchSource
if __name__ == '__main__':
    # Search pixiv with the query below; replace the refresh token with your own.
    source = PixivSearchSource(
        'アークナイツ (surtr OR スルト OR 史尔特尔)',
        refresh_token='use_your_own_refresh_token'
    )
    source.attach(
        # only 1 head,
        HeadCountAction(1),

        # pixiv results often have some irrelevant characters mixed in,
        # so CCIPAction is necessary here to drop these images
        CCIPAction(),

        # if shorter side is over 640, just resize it to 640
        AlignMinSizeAction(640),
    )[:10].export(  # only first 10 images
        # save images (with meta information from pixiv)
        SaveExporter('/data/surtr_arknights_pixiv')
    )
This is what you can get at /data/surtr_arknights_pixiv
Here is a list of the website sources we currently support:
Name | Import Statement |
---|---|
ATFBooruSource | from waifuc.source import ATFBooruSource |
AnimePicturesSource | from waifuc.source import AnimePicturesSource |
DanbooruSource | from waifuc.source import DanbooruSource |
DerpibooruSource | from waifuc.source import DerpibooruSource |
DuitangSource | from waifuc.source import DuitangSource |
E621Source | from waifuc.source import E621Source |
E926Source | from waifuc.source import E926Source |
FurbooruSource | from waifuc.source import FurbooruSource |
GelbooruSource | from waifuc.source import GelbooruSource |
Huashi6Source | from waifuc.source import Huashi6Source |
HypnoHubSource | from waifuc.source import HypnoHubSource |
KonachanNetSource | from waifuc.source import KonachanNetSource |
KonachanSource | from waifuc.source import KonachanSource |
LolibooruSource | from waifuc.source import LolibooruSource |
PahealSource | from waifuc.source import PahealSource |
PixivRankingSource | from waifuc.source import PixivRankingSource |
PixivSearchSource | from waifuc.source import PixivSearchSource |
PixivUserSource | from waifuc.source import PixivUserSource |
Rule34Source | from waifuc.source import Rule34Source |
SafebooruOrgSource | from waifuc.source import SafebooruOrgSource |
SafebooruSource | from waifuc.source import SafebooruSource |
SankakuSource | from waifuc.source import SankakuSource |
TBIBSource | from waifuc.source import TBIBSource |
WallHavenSource | from waifuc.source import WallHavenSource |
XbooruSource | from waifuc.source import XbooruSource |
YandeSource | from waifuc.source import YandeSource |
ZerochanSource | from waifuc.source import ZerochanSource |
This code loads images from a local directory, crops them with the 3-stage cropping method (head, half body, full person), and then saves the results to another local directory.
from waifuc.action import ThreeStageSplitAction
from waifuc.export import SaveExporter
from waifuc.source import LocalSource
# Read the input images from a local directory.
source = LocalSource('/your/path/contains/images')

# Attach the 3-stage split action, then export the results to the output directory.
cropped = source.attach(ThreeStageSplitAction())
cropped.export(SaveExporter('/your/output/path'))